Re: [patch 00/18] SMP: Boot and CPU hotplug refactoring - Part 1
From: Suresh Siddha <hidden>
Date: 2012-05-03 23:39:22
Also in:
lkml
Subsystem:
exec & binfmt api, elf, generic include/asm header files, memory management - core, scheduler, the rest, x86 architecture (32-bit and 64-bit), x86 mm · Maintainers:
Kees Cook, Arnd Bergmann, Andrew Morton, David Hildenbrand, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot, Linus Torvalds, Thomas Gleixner, Borislav Petkov, Dave Hansen, Andy Lutomirski
On Thu, 2012-05-03 at 11:41 +0200, Thomas Gleixner wrote:
On Fri, 20 Apr 2012, Suresh Siddha wrote:quoted
Also, do we really need the workqueue/kthreadd based allocation? Just like the percpu areas getting allocated for each possible cpu during boot, shouldn't we extend this to the per-cpu idle threads too? So something like the appended should be ok to?The idea is correct, there are just a few problems :)quoted
Signed-off-by: Suresh Siddha <redacted> ---diff --git a/kernel/cpu.c b/kernel/cpu.c index 05c46ba..a5144ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c@@ -303,10 +303,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) cpu_hotplug_begin(); - ret = smpboot_prepare(cpu); - if (ret) - goto out; -If we failed to allocate an idle_thread for this cpu in smp_init() then we unconditionally call __cpu_up() with a NULL pointer. That might surprise the arch code :) Aside of that, we now miss to reinitialize the idle thread. We call init_idle() once when we allocate the thread, but not after a cpu offline operation. That might leave stuff in weird state.
Second one slipped through and wasn't intentional. Anyways your modifications look good. While I am here, noticed that we could do 'node' aware idle task struct allocations. Appended the patch for this. Thanks. --- From: Suresh Siddha <redacted> Subject: idle: allocate percpu idle taks from the local node Use the arch specific early_cpu_to_node() to find out the local node for a given 'cpu' and use that info while allocating memory for that cpu's idle task. Signed-off-by: Suresh Siddha <redacted> --- arch/x86/include/asm/topology.h | 8 +++++--- arch/x86/mm/numa.c | 2 +- include/asm-generic/topology.h | 4 ++++ include/linux/kthread.h | 9 ++++++++- kernel/fork.c | 9 ++++++--- kernel/kthread.c | 8 +++----- 6 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae..bdbcee2 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h@@ -57,12 +57,12 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); extern int __cpu_to_node(int cpu); #define cpu_to_node __cpu_to_node -extern int early_cpu_to_node(int cpu); +extern int __early_cpu_to_node(int cpu); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ /* Same function but used if called before per_cpu areas are setup */ -static inline int early_cpu_to_node(int cpu) +static inline int __early_cpu_to_node(int cpu) { return early_per_cpu(x86_cpu_to_node_map, cpu); }
@@ -144,7 +144,7 @@ static inline int numa_node_id(void) */ #define numa_node_id numa_node_id -static inline int early_cpu_to_node(int cpu) +static inline int __early_cpu_to_node(int cpu) { return 0; }
@@ -153,6 +153,8 @@ static inline void setup_node_to_cpumask_map(void) { } #endif +#define early_cpu_to_node(cpu) __early_cpu_to_node(cpu) + #include <asm-generic/topology.h> extern const struct cpumask *cpu_coregroup_mask(int cpu);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa0..142738e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c@@ -734,7 +734,7 @@ EXPORT_SYMBOL(__cpu_to_node); * Same function as cpu_to_node() but used if called before the * per_cpu areas are setup. */ -int early_cpu_to_node(int cpu) +int __early_cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index fc824e2..9ace6da 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h@@ -73,4 +73,8 @@ #endif /* !CONFIG_NUMA || !CONFIG_HAVE_MEMORYLESS_NODES */ +#ifndef early_cpu_to_node +#define early_cpu_to_node(cpu) ((void)(cpu), -1) +#endif + #endif /* _ASM_GENERIC_TOPOLOGY_H */
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0714b24..c1f05e3 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h@@ -40,7 +40,7 @@ void *kthread_data(struct task_struct *k); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; -extern int tsk_fork_get_node(struct task_struct *tsk); +extern int tsk_fork_get_node(struct task_struct *tsk, int idle_tsk); /* * Simple work processor based on kthread.
@@ -131,4 +131,11 @@ bool queue_kthread_work(struct kthread_worker *worker, void flush_kthread_work(struct kthread_work *work); void flush_kthread_worker(struct kthread_worker *worker); +static inline void set_fork_pref_node(int node) +{ +#ifdef CONFIG_NUMA + current->pref_node_fork = node; +#endif +} + #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index ca9a384..108d566 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c@@ -253,12 +253,13 @@ int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, return 0; } -static struct task_struct *dup_task_struct(struct task_struct *orig) +static struct task_struct *dup_task_struct(struct task_struct *orig, + int idle_tsk) { struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - int node = tsk_fork_get_node(orig); + int node = tsk_fork_get_node(orig, idle_tsk); int err; prepare_to_copy(orig);
@@ -1165,7 +1166,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; retval = -ENOMEM; - p = dup_task_struct(current); + p = dup_task_struct(current, pid == &init_struct_pid); if (!p) goto fork_out;
@@ -1531,6 +1532,8 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; + set_fork_pref_node(early_cpu_to_node(cpu)); + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de63..3d74ab1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c@@ -125,10 +125,10 @@ static int kthread(void *_create) } /* called from do_fork() to get node information for about to be created task */ -int tsk_fork_get_node(struct task_struct *tsk) +int tsk_fork_get_node(struct task_struct *tsk, int idle_tsk) { #ifdef CONFIG_NUMA - if (tsk == kthreadd_task) + if (tsk == kthreadd_task || idle_tsk) return tsk->pref_node_fork; #endif return numa_node_id();
@@ -138,9 +138,7 @@ static void create_kthread(struct kthread_create_info *create) { int pid; -#ifdef CONFIG_NUMA - current->pref_node_fork = create->node; -#endif + set_fork_pref_node(create->node); /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) {