Spaces:
Sleeping
ggml : add numa options (llama/5377)
Browse files
* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h
* Reverted Makefile
* Fixed include
* Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables
* removed trailing whitespace
* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h
* Reverting Makefile
* Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet
* Removing MIRROR_MODE code for this PR
* Removing last bit of MIRROR_MODE code for this PR
* Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static
* Fixed lingering init_llama_backend() bool calls in tests and examples
* Remove enum llama_numa_strategies
* Revert bad merge with dynatemp flags
* add missing enum ggml_numa_strategies declaration and revert sync problem with master
* add missing enum ggml_numa_strategies declaration
* fixed ggml_init_numa variable
* Update ggml.h
Co-authored-by: Jared Van Bortel <[email protected]>
* Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges
* split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples
* Fix up some boolean vs enum comparisons
* Added #ifdefs for non-Linux OS that don't have cpu_set_t datatype
* Update ggml.h
Align enum values
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml.c
Remove whitespace
Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml.c
align parameters
Co-authored-by: Georgi Gerganov <[email protected]>
* Update examples/server/server.cpp
remove whitespace and align brace
Co-authored-by: Georgi Gerganov <[email protected]>
* Update common/common.cpp
Remove whitespace and align brace
Co-authored-by: Georgi Gerganov <[email protected]>
* unified ggml_numa_strategy enum and fixed text alignment in server.cpp example
* Update ggml.c
simplified return for platforms without NUMA support
Co-authored-by: Jared Van Bortel <[email protected]>
* removed redundant else from cli argument processing of --numa
* whitespace
---------
Co-authored-by: root <[email protected]>
Co-authored-by: Jared Van Bortel <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Jared Van Bortel <[email protected]>
|
@@ -1954,9 +1954,16 @@ struct ggml_numa_node {
|
|
| 1954 |
};
|
| 1955 |
|
| 1956 |
struct ggml_numa_nodes {
|
|
|
|
| 1957 |
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
| 1958 |
uint32_t n_nodes;
|
| 1959 |
uint32_t total_cpus; // hardware threads on system
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1960 |
};
|
| 1961 |
|
| 1962 |
//
|
|
@@ -1990,7 +1997,22 @@ inline static void ggml_critical_section_end(void) {
|
|
| 1990 |
atomic_fetch_sub(&g_state_barrier, 1);
|
| 1991 |
}
|
| 1992 |
|
| 1993 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1994 |
if (g_state.numa.n_nodes > 0) {
|
| 1995 |
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
| 1996 |
|
|
@@ -2002,6 +2024,13 @@ void ggml_numa_init(void) {
|
|
| 2002 |
char path[256];
|
| 2003 |
int rv;
|
| 2004 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2005 |
// enumerate nodes
|
| 2006 |
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
| 2007 |
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
|
@@ -2020,11 +2049,17 @@ void ggml_numa_init(void) {
|
|
| 2020 |
|
| 2021 |
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
| 2022 |
|
| 2023 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2024 |
g_state.numa.n_nodes = 0;
|
| 2025 |
return;
|
| 2026 |
}
|
| 2027 |
|
|
|
|
|
|
|
| 2028 |
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
| 2029 |
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
| 2030 |
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
|
@@ -16638,26 +16673,46 @@ typedef pthread_t ggml_thread_t;
|
|
| 16638 |
|
| 16639 |
// Android's libc implementation "bionic" does not support setting affinity
|
| 16640 |
#if defined(__linux__) && !defined(__BIONIC__)
|
| 16641 |
-
static void set_numa_thread_affinity(int thread_n
|
| 16642 |
if (!ggml_is_numa()) {
|
| 16643 |
return;
|
| 16644 |
}
|
| 16645 |
|
| 16646 |
-
|
| 16647 |
-
|
| 16648 |
-
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
| 16649 |
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
| 16650 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16651 |
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
| 16652 |
CPU_ZERO_S(setsize, cpus);
|
| 16653 |
for (size_t i = 0; i < node->n_cpus; ++i) {
|
| 16654 |
CPU_SET_S(node->cpus[i], setsize, cpus);
|
| 16655 |
}
|
| 16656 |
|
| 16657 |
-
|
| 16658 |
if (rv) {
|
| 16659 |
-
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
|
| 16660 |
-
strerror(rv));
|
| 16661 |
}
|
| 16662 |
|
| 16663 |
CPU_FREE(cpus);
|
|
@@ -16678,8 +16733,7 @@ static void clear_numa_thread_affinity(void) {
|
|
| 16678 |
|
| 16679 |
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
| 16680 |
if (rv) {
|
| 16681 |
-
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
|
| 16682 |
-
strerror(rv));
|
| 16683 |
}
|
| 16684 |
|
| 16685 |
CPU_FREE(cpus);
|
|
@@ -16687,7 +16741,7 @@ static void clear_numa_thread_affinity(void) {
|
|
| 16687 |
#else
|
| 16688 |
// TODO: Windows etc.
|
| 16689 |
// (the linux implementation may also work on BSD, someone should test)
|
| 16690 |
-
static void set_numa_thread_affinity(int thread_n
|
| 16691 |
static void clear_numa_thread_affinity(void) {}
|
| 16692 |
#endif
|
| 16693 |
|
|
@@ -16987,7 +17041,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
| 16987 |
|
| 16988 |
const int n_threads = state->shared->n_threads;
|
| 16989 |
|
| 16990 |
-
set_numa_thread_affinity(state->ith
|
| 16991 |
|
| 16992 |
int node_n = -1;
|
| 16993 |
int task_phase = GGML_TASK_FINALIZE;
|
|
|
|
| 1954 |
};
|
| 1955 |
|
| 1956 |
struct ggml_numa_nodes {
|
| 1957 |
+
enum ggml_numa_strategy numa_strategy;
|
| 1958 |
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
| 1959 |
uint32_t n_nodes;
|
| 1960 |
uint32_t total_cpus; // hardware threads on system
|
| 1961 |
+
uint32_t current_node; // node on which main process is executing
|
| 1962 |
+
#ifdef __linux__
|
| 1963 |
+
cpu_set_t cpuset; // cpuset from numactl
|
| 1964 |
+
#else
|
| 1965 |
+
uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
|
| 1966 |
+
#endif
|
| 1967 |
};
|
| 1968 |
|
| 1969 |
//
|
|
|
|
| 1997 |
atomic_fetch_sub(&g_state_barrier, 1);
|
| 1998 |
}
|
| 1999 |
|
| 2000 |
+
#ifdef __linux__
|
| 2001 |
+
static cpu_set_t ggml_get_numa_affinity(void) {
|
| 2002 |
+
cpu_set_t cpuset;
|
| 2003 |
+
pthread_t thread;
|
| 2004 |
+
thread = pthread_self();
|
| 2005 |
+
CPU_ZERO(&cpuset);
|
| 2006 |
+
pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
|
| 2007 |
+
return cpuset;
|
| 2008 |
+
}
|
| 2009 |
+
#else
|
| 2010 |
+
static uint32_t ggml_get_numa_affinity(void) {
|
| 2011 |
+
return 0; // no NUMA support
|
| 2012 |
+
}
|
| 2013 |
+
#endif
|
| 2014 |
+
|
| 2015 |
+
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
| 2016 |
if (g_state.numa.n_nodes > 0) {
|
| 2017 |
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
| 2018 |
|
|
|
|
| 2024 |
char path[256];
|
| 2025 |
int rv;
|
| 2026 |
|
| 2027 |
+
// set numa scheme
|
| 2028 |
+
g_state.numa.numa_strategy = numa_flag;
|
| 2029 |
+
|
| 2030 |
+
GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
|
| 2031 |
+
|
| 2032 |
+
g_state.numa.cpuset = ggml_get_numa_affinity();
|
| 2033 |
+
|
| 2034 |
// enumerate nodes
|
| 2035 |
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
| 2036 |
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
|
|
|
| 2049 |
|
| 2050 |
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
| 2051 |
|
| 2052 |
+
// figure out which node we're on
|
| 2053 |
+
uint current_cpu;
|
| 2054 |
+
int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
|
| 2055 |
+
|
| 2056 |
+
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
|
| 2057 |
g_state.numa.n_nodes = 0;
|
| 2058 |
return;
|
| 2059 |
}
|
| 2060 |
|
| 2061 |
+
GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
|
| 2062 |
+
|
| 2063 |
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
| 2064 |
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
| 2065 |
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
|
|
|
| 16673 |
|
| 16674 |
// Android's libc implementation "bionic" does not support setting affinity
|
| 16675 |
#if defined(__linux__) && !defined(__BIONIC__)
|
| 16676 |
+
static void set_numa_thread_affinity(int thread_n) {
|
| 16677 |
if (!ggml_is_numa()) {
|
| 16678 |
return;
|
| 16679 |
}
|
| 16680 |
|
| 16681 |
+
int node_num;
|
| 16682 |
+
int rv;
|
|
|
|
| 16683 |
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
| 16684 |
|
| 16685 |
+
switch(g_state.numa.numa_strategy) {
|
| 16686 |
+
case GGML_NUMA_STRATEGY_DISTRIBUTE:
|
| 16687 |
+
// run thread on node_num = thread_n % (number of NUMA nodes), i.e. round-robin across nodes
|
| 16688 |
+
node_num = thread_n % g_state.numa.n_nodes;
|
| 16689 |
+
break;
|
| 16690 |
+
case GGML_NUMA_STRATEGY_ISOLATE:
|
| 16691 |
+
// run thread on current_node
|
| 16692 |
+
node_num = g_state.numa.current_node;
|
| 16693 |
+
break;
|
| 16694 |
+
case GGML_NUMA_STRATEGY_NUMACTL:
|
| 16695 |
+
// use the cpuset that numactl gave us
|
| 16696 |
+
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
|
| 16697 |
+
if (rv) {
|
| 16698 |
+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
|
| 16699 |
+
}
|
| 16700 |
+
return;
|
| 16701 |
+
default:
|
| 16702 |
+
return;
|
| 16703 |
+
}
|
| 16704 |
+
|
| 16705 |
+
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
| 16706 |
+
|
| 16707 |
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
| 16708 |
CPU_ZERO_S(setsize, cpus);
|
| 16709 |
for (size_t i = 0; i < node->n_cpus; ++i) {
|
| 16710 |
CPU_SET_S(node->cpus[i], setsize, cpus);
|
| 16711 |
}
|
| 16712 |
|
| 16713 |
+
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
| 16714 |
if (rv) {
|
| 16715 |
+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
|
|
|
| 16716 |
}
|
| 16717 |
|
| 16718 |
CPU_FREE(cpus);
|
|
|
|
| 16733 |
|
| 16734 |
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
| 16735 |
if (rv) {
|
| 16736 |
+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
|
|
|
| 16737 |
}
|
| 16738 |
|
| 16739 |
CPU_FREE(cpus);
|
|
|
|
| 16741 |
#else
|
| 16742 |
// TODO: Windows etc.
|
| 16743 |
// (the linux implementation may also work on BSD, someone should test)
|
| 16744 |
+
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
|
| 16745 |
static void clear_numa_thread_affinity(void) {}
|
| 16746 |
#endif
|
| 16747 |
|
|
|
|
| 17041 |
|
| 17042 |
const int n_threads = state->shared->n_threads;
|
| 17043 |
|
| 17044 |
+
set_numa_thread_affinity(state->ith);
|
| 17045 |
|
| 17046 |
int node_n = -1;
|
| 17047 |
int task_phase = GGML_TASK_FINALIZE;
|
|
@@ -658,6 +658,16 @@ extern "C" {
|
|
| 658 |
void * wdata;
|
| 659 |
};
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
// misc
|
| 662 |
|
| 663 |
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
|
@@ -668,7 +678,7 @@ extern "C" {
|
|
| 668 |
|
| 669 |
GGML_API void ggml_print_backtrace(void);
|
| 670 |
|
| 671 |
-
GGML_API void ggml_numa_init(
|
| 672 |
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
| 673 |
|
| 674 |
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
|
|
|
| 658 |
void * wdata;
|
| 659 |
};
|
| 660 |
|
| 661 |
+
// numa strategies
|
| 662 |
+
enum ggml_numa_strategy {
|
| 663 |
+
GGML_NUMA_STRATEGY_DISABLED = 0,
|
| 664 |
+
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
| 665 |
+
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
| 666 |
+
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
| 667 |
+
GGML_NUMA_STRATEGY_MIRROR = 4,
|
| 668 |
+
GGML_NUMA_STRATEGY_COUNT
|
| 669 |
+
};
|
| 670 |
+
|
| 671 |
// misc
|
| 672 |
|
| 673 |
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
|
|
|
| 678 |
|
| 679 |
GGML_API void ggml_print_backtrace(void);
|
| 680 |
|
| 681 |
+
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
| 682 |
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
| 683 |
|
| 684 |
GGML_API void ggml_print_object (const struct ggml_object * obj);
|