Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP4:GA
irqbalance
0001-Teach-irqbalance-about-Intel-CoD.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 0001-Teach-irqbalance-about-Intel-CoD.patch of Package irqbalance
From a3feae7b68df0a303bb13a305d951620ef2061af Mon Sep 17 00:00:00 2001 From: Krister Johansen <kjlx@templeofstupid.com> Date: Mon, 10 Jul 2017 17:00:27 -0700 Subject: Teach irqbalance about Intel CoD. This originally surfaced as a bug in placing network interrupts. In the case that this submitter observed, the NIC card was in NUMA domain 0, but each RSS interrupt was getting an affinity list for all CPUs in the domain. The expected behavior is for a single cpu to be chosen when attempting to fan out NIC interrupts. Due to other implementation details of interrupt placement, this effectively caused all interrupt mappings for this NIC to end up on CPU 0. The bug turns out to have been caused by Intel Cluster on Die breaking an assumption in irqbalance about the design of the component hierarchy. The CoD topology allows a CPU package to belong to more than one NUMA node, which is not expected. The RCA was that when the second NUMA node was wired up to the existing physical package, it overwrote the mappings that were placed there by the first. This patch attempts to solve that problem by permitting a package to have multiple NUMA nodes. The CPU component hierarchy is preserved, in case other parts of the code depend upon walking it. When a CoD topology is detected, the NUMA node -> CPU component mapping is moved down a level, so that the nodes point to the first level where the affinity becomes distinct. In practice, this has been observed to be the LLC. A quick illustration (now, with COD, it looks like this): +-----------+ | NUMA Node | | 0 | +-----------+ | | +-------+ \|/ / | CPU 0 | +---------+ +-------+ | Cache 0 | +---------+ +-------+ / \ | CPU 1 | +-----------+ +-------+ | Package 0 | +-----------+ +-------+ \ / | CPU 2 | +---------+ +-------+ | Cache 1 | +---------+ ^ \ +-------+ | | CPU 3 | | +-------+ +-----------+ | NUMA Node | | 1 | +-----------+ Whereas, previously only NUMA Node 1 would end up pointing to package 0. 
The topology should not be different on platforms that do not enable CoD. Signed-off-by: Krister Johansen <kjlx@templeofstupid.com> (cherry picked from commit 7bc1244fbf4cbc41cfdaf5583e2bcf6f50d53c88) --- cputree.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++-------------- irqbalance.c | 9 +++++-- irqbalance.h | 17 ++----------- numa.c | 28 ++++++++++++++++----- types.h | 1 + 5 files changed, 94 insertions(+), 41 deletions(-) --- a/cputree.c +++ b/cputree.c @@ -112,8 +112,32 @@ out: log(TO_CONSOLE, LOG_INFO, "Adaptive-ticks CPUs: %s\n", buffer); } -static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache, - int packageid, cpumask_t package_mask) +static void add_numa_node_to_topo_obj(struct topo_obj *obj, int nodeid) +{ + GList *entry; + struct topo_obj *node; + struct topo_obj *cand_node; + + node = get_numa_node(nodeid); + if (!node || node->number == -1) + return; + + entry = g_list_first(obj->numa_nodes); + while (entry) { + cand_node = entry->data; + if (cand_node == node) + break; + entry = g_list_next(entry); + } + + if (!entry) + obj->numa_nodes = g_list_append(obj->numa_nodes, node); +} + +static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache, + int packageid, + cpumask_t package_mask, + int nodeid) { GList *entry; struct topo_obj *package; @@ -156,10 +180,14 @@ static struct topo_obj* add_cache_domain cache->parent = package; } + if (nodeid > -1) + add_numa_node_to_topo_obj(package, nodeid); + return package; } static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu, - cpumask_t cache_mask) + cpumask_t cache_mask, + int nodeid) { GList *entry; struct topo_obj *cache; @@ -199,6 +227,9 @@ static struct topo_obj* add_cpu_to_cache cpu->parent = (struct topo_obj *)cache; } + if (nodeid > -1) + add_numa_node_to_topo_obj(cache, nodeid); + return cache; } @@ -335,9 +366,9 @@ static void do_one_cpu(char *path) cpus_and(cache_mask, cache_mask, unbanned_cpus); cpus_and(package_mask, package_mask, 
unbanned_cpus); - cache = add_cpu_to_cache_domain(cpu, cache_mask); - package = add_cache_domain_to_package(cache, packageid, package_mask); - add_package_to_node(package, nodeid); + cache = add_cpu_to_cache_domain(cpu, cache_mask, nodeid); + package = add_cache_domain_to_package(cache, packageid, package_mask, + nodeid); cpu->obj_type_list = &cpus; cpus = g_list_append(cpus, cpu); @@ -359,12 +390,18 @@ static void dump_irq(struct irq_info *in free(indent); } +static void dump_numa_node_num(struct topo_obj *p, void *data __attribute__((unused))) +{ + log(TO_CONSOLE, LOG_INFO, "%d ", p->number); +} + static void dump_balance_obj(struct topo_obj *d, void *data __attribute__((unused))) { struct topo_obj *c = (struct topo_obj *)d; - log(TO_CONSOLE, LOG_INFO, "%s%s%s%sCPU number %i numa_node is %d (load %lu)\n", - log_indent, log_indent, log_indent, log_indent, - c->number, cpu_numa_node(c)->number , (unsigned long)c->load); + log(TO_CONSOLE, LOG_INFO, "%s%s%s%sCPU number %i numa_node is ", + log_indent, log_indent, log_indent, log_indent, c->number); + for_each_object(cpu_numa_node(c), dump_numa_node_num, NULL); + log(TO_CONSOLE, LOG_INFO, "(load %lu)\n", (unsigned long)c->load); if (c->interrupts) for_each_irq(c->interrupts, dump_irq, (void *)18); } @@ -373,9 +410,11 @@ static void dump_cache_domain(struct top { char *buffer = data; cpumask_scnprintf(buffer, 4095, d->mask); - log(TO_CONSOLE, LOG_INFO, "%s%sCache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", - log_indent, log_indent, - d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load); + log(TO_CONSOLE, LOG_INFO, "%s%sCache domain %i: numa_node is ", + log_indent, log_indent, d->number); + for_each_object(d->numa_nodes, dump_numa_node_num, NULL); + log(TO_CONSOLE, LOG_INFO, "cpu mask is %s (load %lu) \n", buffer, + (unsigned long)d->load); if (d->children) for_each_object(d->children, dump_balance_obj, NULL); if (g_list_length(d->interrupts) > 0) @@ -386,8 +425,10 @@ static void 
dump_package(struct topo_obj { char *buffer = data; cpumask_scnprintf(buffer, 4096, d->mask); - log(TO_CONSOLE, LOG_INFO, "Package %i: numa_node is %d cpu mask is %s (load %lu)\n", - d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load); + log(TO_CONSOLE, LOG_INFO, "Package %i: numa_node ", d->number); + for_each_object(d->numa_nodes, dump_numa_node_num, NULL); + log(TO_CONSOLE, LOG_INFO, "cpu mask is %s (load %lu)\n", + buffer, (unsigned long)d->load); if (d->children) for_each_object(d->children, dump_cache_domain, buffer); if (g_list_length(d->interrupts) > 0) @@ -439,9 +480,9 @@ void parse_cpu_tree(void) char pad; entry = readdir(dir); /* - * We only want to count real cpus, not cpufreq and - * cpuidle - */ + * We only want to count real cpus, not cpufreq and + * cpuidle + */ if (entry && sscanf(entry->d_name, "cpu%d%c", &num, &pad) == 1 && !strchr(entry->d_name, ' ')) { @@ -450,7 +491,8 @@ void parse_cpu_tree(void) do_one_cpu(new_path); } } while (entry); - closedir(dir); + closedir(dir); + for_each_object(packages, connect_cpu_mem_topo, NULL); if (debug_mode) dump_tree(); @@ -474,6 +516,7 @@ void clear_cpu_tree(void) package = item->data; g_list_free(package->children); g_list_free(package->interrupts); + g_list_free(package->numa_nodes); free(package); packages = g_list_delete_link(packages, item); } @@ -484,6 +527,7 @@ void clear_cpu_tree(void) cache_domain = item->data; g_list_free(cache_domain->children); g_list_free(cache_domain->interrupts); + g_list_free(cache_domain->numa_nodes); free(cache_domain); cache_domains = g_list_delete_link(cache_domains, item); } --- a/irqbalance.c +++ b/irqbalance.c @@ -200,9 +200,14 @@ static void parse_command_line(int argc, #endif /* - * This builds our object tree. The Heirarchy is pretty straightforward + * This builds our object tree. The Heirarchy is typically pretty + * straightforward. 
* At the top are numa_nodes - * All CPU packages belong to a single numa_node + * CPU packages belong to a single numa_node, unless the cache domains are in + * separate nodes. In that case, the cache domain's parent is the package, but + * the numa nodes point to the cache domains instead of the package as their + * children. This allows us to maintain the CPU hierarchy while adjusting for + * alternate memory topologies that are present on recent processor. * All Cache domains belong to a CPU package * All CPU cores belong to a cache domain * --- a/irqbalance.h +++ b/irqbalance.h @@ -89,26 +89,13 @@ extern long HZ; extern void build_numa_node_list(void); extern void free_numa_node_list(void); extern void dump_numa_node_info(struct topo_obj *node, void *data); -extern void add_package_to_node(struct topo_obj *p, int nodeid); +extern void connect_cpu_mem_topo(struct topo_obj *p, void *data); extern struct topo_obj *get_numa_node(int nodeid); /* - * Package functions - */ -#define package_numa_node(p) ((p)->parent) - -/* - * cache_domain functions - */ -#define cache_domain_package(c) ((c)->parent) -#define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) - -/* * cpu core functions */ -#define cpu_cache_domain(cpu) ((cpu)->parent) -#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) -#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) +#define cpu_numa_node(cpu) ((cpu)->parent->numa_nodes) extern struct topo_obj *find_cpu_core(int cpunr); extern int get_cpu_count(void); --- a/numa.c +++ b/numa.c @@ -146,22 +146,38 @@ static gint compare_node(gconstpointer a return (ai->number == bi->number) ? 
0 : 1; } -void add_package_to_node(struct topo_obj *p, int nodeid) +void connect_cpu_mem_topo(struct topo_obj *p, void *data __attribute__((unused))) { + GList *entry; struct topo_obj *node; + struct topo_obj *lchild; + int len; - node = get_numa_node(nodeid); + len = g_list_length(p->numa_nodes); - if (!node) { - log(TO_CONSOLE, LOG_INFO, "Could not find numa node for node id %d\n", nodeid); + if (len == 0) { + return; + } else if (len > 1) { + for_each_object(p->children, connect_cpu_mem_topo, NULL); return; } + entry = g_list_first(p->numa_nodes); + node = entry->data; - if (!p->parent) { - node->children = g_list_append(node->children, p); + if (p->obj_type == OBJ_TYPE_PACKAGE && !p->parent) p->parent = node; + + entry = g_list_first(node->children); + while (entry) { + lchild = entry->data; + if (lchild == p) + break; + entry = g_list_next(entry); } + + if (!entry) + node->children = g_list_append(node->children, p); } void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused))) --- a/types.h +++ b/types.h @@ -54,6 +54,7 @@ struct topo_obj { GList *interrupts; struct topo_obj *parent; GList *children; + GList *numa_nodes; GList **obj_type_list; };
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor