bmwl root Cebtenzzre ggerganov Cebtenzzre committed on
Commit
7c952d2
·
unverified ·
1 Parent(s): 1f047ca

ggml : add numa options (llama/5377)

Browse files

* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h

* Reverted Makefile

* Fixed include

* Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables

* removed trailing whitespace

* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h

* Reverting Makefile

* Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet

* Removing MIRROR_MODE code for this PR

* Removing last bit of MIRROR_MODE code for this PR

* Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static

* Fixed lingering init_llama_backend() bool calls in tests and examples

* Remove enum llama_numa_strategies

* Revert bad merge with dynatemp flags

* add missing enum ggml_numa_strategies declaration and revert sync problem with master

* add missing enum ggml_numa_strategies declaration

* fixed ggml_init_numa variable

* Update ggml.h

Co-authored-by: Jared Van Bortel <[email protected]>

* Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges

* split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples

* Fix up some boolean vs enum comparisons

* Added #ifdefs for non-Linux OS that don't have cpu_set_t datatype

* Update ggml.h

Align enum values

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml.c

Remove whitespace

Co-authored-by: Georgi Gerganov <[email protected]>

* Update ggml.c

align parameters

Co-authored-by: Georgi Gerganov <[email protected]>

* Update examples/server/server.cpp

remove whitespace and align brace

Co-authored-by: Georgi Gerganov <[email protected]>

* Update common/common.cpp

Remove whitespace and align brace

Co-authored-by: Georgi Gerganov <[email protected]>

* unified ggml_numa_strategy enum and fixed text alignment in server.cpp example

* Update ggml.c

simplified return for platforms without NUMA support

Co-authored-by: Jared Van Bortel <[email protected]>

* removed redundant else from cli argument processing of --numa

* whitespace

---------

Co-authored-by: root <[email protected]>
Co-authored-by: Jared Van Bortel <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Jared Van Bortel <[email protected]>

Files changed (2) hide show
  1. ggml.c +67 -13
  2. ggml.h +11 -1
ggml.c CHANGED
@@ -1954,9 +1954,16 @@ struct ggml_numa_node {
1954
  };
1955
 
1956
  struct ggml_numa_nodes {
 
1957
  struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
1958
  uint32_t n_nodes;
1959
  uint32_t total_cpus; // hardware threads on system
 
 
 
 
 
 
1960
  };
1961
 
1962
  //
@@ -1990,7 +1997,22 @@ inline static void ggml_critical_section_end(void) {
1990
  atomic_fetch_sub(&g_state_barrier, 1);
1991
  }
1992
 
1993
- void ggml_numa_init(void) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1994
  if (g_state.numa.n_nodes > 0) {
1995
  fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
1996
 
@@ -2002,6 +2024,13 @@ void ggml_numa_init(void) {
2002
  char path[256];
2003
  int rv;
2004
 
 
 
 
 
 
 
 
2005
  // enumerate nodes
2006
  while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
2007
  rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -2020,11 +2049,17 @@ void ggml_numa_init(void) {
2020
 
2021
  GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
2022
 
2023
- if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
 
 
 
 
2024
  g_state.numa.n_nodes = 0;
2025
  return;
2026
  }
2027
 
 
 
2028
  for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
2029
  struct ggml_numa_node * node = &g_state.numa.nodes[n];
2030
  GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -16638,26 +16673,46 @@ typedef pthread_t ggml_thread_t;
16638
 
16639
  // Android's libc implementation "bionic" does not support setting affinity
16640
  #if defined(__linux__) && !defined(__BIONIC__)
16641
- static void set_numa_thread_affinity(int thread_n, int n_threads) {
16642
  if (!ggml_is_numa()) {
16643
  return;
16644
  }
16645
 
16646
- // run thread on node_num thread_n / (threads per node)
16647
- const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16648
- struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16649
  size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16651
  cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16652
  CPU_ZERO_S(setsize, cpus);
16653
  for (size_t i = 0; i < node->n_cpus; ++i) {
16654
  CPU_SET_S(node->cpus[i], setsize, cpus);
16655
  }
16656
 
16657
- int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16658
  if (rv) {
16659
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16660
- strerror(rv));
16661
  }
16662
 
16663
  CPU_FREE(cpus);
@@ -16678,8 +16733,7 @@ static void clear_numa_thread_affinity(void) {
16678
 
16679
  int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16680
  if (rv) {
16681
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16682
- strerror(rv));
16683
  }
16684
 
16685
  CPU_FREE(cpus);
@@ -16687,7 +16741,7 @@ static void clear_numa_thread_affinity(void) {
16687
  #else
16688
  // TODO: Windows etc.
16689
  // (the linux implementation may also work on BSD, someone should test)
16690
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16691
  static void clear_numa_thread_affinity(void) {}
16692
  #endif
16693
 
@@ -16987,7 +17041,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16987
 
16988
  const int n_threads = state->shared->n_threads;
16989
 
16990
- set_numa_thread_affinity(state->ith, n_threads);
16991
 
16992
  int node_n = -1;
16993
  int task_phase = GGML_TASK_FINALIZE;
 
1954
  };
1955
 
1956
  struct ggml_numa_nodes {
1957
+ enum ggml_numa_strategy numa_strategy;
1958
  struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
1959
  uint32_t n_nodes;
1960
  uint32_t total_cpus; // hardware threads on system
1961
+ uint32_t current_node; // node on which main process is execting
1962
+ #ifdef __linux__
1963
+ cpu_set_t cpuset; // cpuset from numactl
1964
+ #else
1965
+ uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
1966
+ #endif
1967
  };
1968
 
1969
  //
 
1997
  atomic_fetch_sub(&g_state_barrier, 1);
1998
  }
1999
 
2000
+ #ifdef __linux__
2001
+ static cpu_set_t ggml_get_numa_affinity(void) {
2002
+ cpu_set_t cpuset;
2003
+ pthread_t thread;
2004
+ thread = pthread_self();
2005
+ CPU_ZERO(&cpuset);
2006
+ pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
2007
+ return cpuset;
2008
+ }
2009
+ #else
2010
+ static uint32_t ggml_get_numa_affinity(void) {
2011
+ return 0; // no NUMA support
2012
+ }
2013
+ #endif
2014
+
2015
+ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
2016
  if (g_state.numa.n_nodes > 0) {
2017
  fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
2018
 
 
2024
  char path[256];
2025
  int rv;
2026
 
2027
+ // set numa scheme
2028
+ g_state.numa.numa_strategy = numa_flag;
2029
+
2030
+ GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
2031
+
2032
+ g_state.numa.cpuset = ggml_get_numa_affinity();
2033
+
2034
  // enumerate nodes
2035
  while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
2036
  rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
 
2049
 
2050
  GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
2051
 
2052
+ // figure out which node we're on
2053
+ uint current_cpu;
2054
+ int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2055
+
2056
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
2057
  g_state.numa.n_nodes = 0;
2058
  return;
2059
  }
2060
 
2061
+ GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
2062
+
2063
  for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
2064
  struct ggml_numa_node * node = &g_state.numa.nodes[n];
2065
  GGML_PRINT_DEBUG("CPUs on node %u:", n);
 
16673
 
16674
  // Android's libc implementation "bionic" does not support setting affinity
16675
  #if defined(__linux__) && !defined(__BIONIC__)
16676
+ static void set_numa_thread_affinity(int thread_n) {
16677
  if (!ggml_is_numa()) {
16678
  return;
16679
  }
16680
 
16681
+ int node_num;
16682
+ int rv;
 
16683
  size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16684
 
16685
+ switch(g_state.numa.numa_strategy) {
16686
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
16687
+ // run thread on node_num thread_n / (threads per node)
16688
+ node_num = thread_n % g_state.numa.n_nodes;
16689
+ break;
16690
+ case GGML_NUMA_STRATEGY_ISOLATE:
16691
+ // run thread on current_node
16692
+ node_num = g_state.numa.current_node;
16693
+ break;
16694
+ case GGML_NUMA_STRATEGY_NUMACTL:
16695
+ // use the cpuset that numactl gave us
16696
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
16697
+ if (rv) {
16698
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
16699
+ }
16700
+ return;
16701
+ default:
16702
+ return;
16703
+ }
16704
+
16705
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16706
+
16707
  cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16708
  CPU_ZERO_S(setsize, cpus);
16709
  for (size_t i = 0; i < node->n_cpus; ++i) {
16710
  CPU_SET_S(node->cpus[i], setsize, cpus);
16711
  }
16712
 
16713
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16714
  if (rv) {
16715
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
 
16716
  }
16717
 
16718
  CPU_FREE(cpus);
 
16733
 
16734
  int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16735
  if (rv) {
16736
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
 
16737
  }
16738
 
16739
  CPU_FREE(cpus);
 
16741
  #else
16742
  // TODO: Windows etc.
16743
  // (the linux implementation may also work on BSD, someone should test)
16744
+ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
16745
  static void clear_numa_thread_affinity(void) {}
16746
  #endif
16747
 
 
17041
 
17042
  const int n_threads = state->shared->n_threads;
17043
 
17044
+ set_numa_thread_affinity(state->ith);
17045
 
17046
  int node_n = -1;
17047
  int task_phase = GGML_TASK_FINALIZE;
ggml.h CHANGED
@@ -658,6 +658,16 @@ extern "C" {
658
  void * wdata;
659
  };
660
 
 
 
 
 
 
 
 
 
 
 
661
  // misc
662
 
663
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -668,7 +678,7 @@ extern "C" {
668
 
669
  GGML_API void ggml_print_backtrace(void);
670
 
671
- GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
672
  GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
673
 
674
  GGML_API void ggml_print_object (const struct ggml_object * obj);
 
658
  void * wdata;
659
  };
660
 
661
+ // numa strategies
662
+ enum ggml_numa_strategy {
663
+ GGML_NUMA_STRATEGY_DISABLED = 0,
664
+ GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
665
+ GGML_NUMA_STRATEGY_ISOLATE = 2,
666
+ GGML_NUMA_STRATEGY_NUMACTL = 3,
667
+ GGML_NUMA_STRATEGY_MIRROR = 4,
668
+ GGML_NUMA_STRATEGY_COUNT
669
+ };
670
+
671
  // misc
672
 
673
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
 
678
 
679
  GGML_API void ggml_print_backtrace(void);
680
 
681
+ GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
682
  GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
683
 
684
  GGML_API void ggml_print_object (const struct ggml_object * obj);