Binary files numa-sched-ref/ID and numa-sched/ID differ
diff -urN numa-sched-ref/arch/alpha/config.in numa-sched/arch/alpha/config.in
--- numa-sched-ref/arch/alpha/config.in	Sat Dec 8 13:23:26 2001
+++ numa-sched/arch/alpha/config.in	Sat Dec 8 13:23:50 2001
@@ -223,6 +223,9 @@
 	bool 'Discontiguous Memory Support' CONFIG_DISCONTIGMEM
 	if [ "$CONFIG_DISCONTIGMEM" = "y" ]; then
 		bool ' NUMA Support' CONFIG_NUMA
+		if [ "$CONFIG_NUMA" = "y" ]; then
+			bool ' NUMA Scheduler Support' CONFIG_NUMA_SCHED
+		fi
 	fi
 fi
diff -urN numa-sched-ref/arch/alpha/kernel/entry.S numa-sched/arch/alpha/kernel/entry.S
--- numa-sched-ref/arch/alpha/kernel/entry.S	Fri Nov 23 08:20:50 2001
+++ numa-sched/arch/alpha/kernel/entry.S	Sat Dec 8 13:23:50 2001
@@ -35,7 +35,7 @@
 #define TASK_EXEC_DOMAIN 32
 #define TASK_NEED_RESCHED 40
 #define TASK_PTRACE 48
-#define TASK_PROCESSOR 100
+#define TASK_PROCESSOR 84
 
 /*
  * task flags (must match include/linux/sched.h):
diff -urN numa-sched-ref/include/asm-alpha/mmzone.h numa-sched/include/asm-alpha/mmzone.h
--- numa-sched-ref/include/asm-alpha/mmzone.h	Sat May 26 04:03:47 2001
+++ numa-sched/include/asm-alpha/mmzone.h	Sat Dec 8 13:23:50 2001
@@ -21,7 +21,7 @@
 #ifdef NOTYET
 	kern_vars_t kern_vars;
 #endif
-#if defined(CONFIG_NUMA) && defined(CONFIG_NUMA_SCHED)
+#ifdef CONFIG_NUMA_SCHED
 	struct numa_schedule_data schedule_data;
 #endif
 } plat_pg_data_t;
diff -urN numa-sched-ref/include/asm-alpha/timex.h numa-sched/include/asm-alpha/timex.h
--- numa-sched-ref/include/asm-alpha/timex.h	Tue Dec 29 22:56:15 1998
+++ numa-sched/include/asm-alpha/timex.h	Sat Dec 8 13:23:50 2001
@@ -27,4 +27,8 @@
 	return ret;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-arm/timex.h numa-sched/include/asm-arm/timex.h
--- numa-sched-ref/include/asm-arm/timex.h	Thu Nov 16 15:37:33 2000
+++ numa-sched/include/asm-arm/timex.h	Sat Dec 8 13:23:50 2001
@@ -23,4 +23,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-cris/timex.h numa-sched/include/asm-cris/timex.h
--- numa-sched-ref/include/asm-cris/timex.h	Fri Nov 23 08:21:02 2001
+++ numa-sched/include/asm-cris/timex.h	Sat Dec 8 13:23:50 2001
@@ -33,4 +33,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-i386/timex.h numa-sched/include/asm-i386/timex.h
--- numa-sched-ref/include/asm-i386/timex.h	Mon Nov 19 18:19:00 2001
+++ numa-sched/include/asm-i386/timex.h	Sat Dec 8 13:23:50 2001
@@ -47,4 +47,8 @@
 
 extern unsigned long cpu_khz;
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-ia64/timex.h numa-sched/include/asm-ia64/timex.h
--- numa-sched-ref/include/asm-ia64/timex.h	Tue May 1 19:35:31 2001
+++ numa-sched/include/asm-ia64/timex.h	Sat Dec 8 13:23:50 2001
@@ -21,4 +21,8 @@
 	return ret;
 }
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* _ASM_IA64_TIMEX_H */
diff -urN numa-sched-ref/include/asm-m68k/timex.h numa-sched/include/asm-m68k/timex.h
--- numa-sched-ref/include/asm-m68k/timex.h	Tue Jan 5 20:20:43 1999
+++ numa-sched/include/asm-m68k/timex.h	Sat Dec 8 13:23:50 2001
@@ -19,4 +19,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-mips/timex.h numa-sched/include/asm-mips/timex.h
--- numa-sched-ref/include/asm-mips/timex.h	Sat May 13 17:31:25 2000
+++ numa-sched/include/asm-mips/timex.h	Sat Dec 8 13:23:50 2001
@@ -36,6 +36,11 @@
 {
 	return read_32bit_cp0_register(CP0_COUNT);
 }
+
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASM_MIPS_TIMEX_H */
diff -urN numa-sched-ref/include/asm-mips64/timex.h numa-sched/include/asm-mips64/timex.h
--- numa-sched-ref/include/asm-mips64/timex.h	Sun Sep 23 21:11:41 2001
+++ numa-sched/include/asm-mips64/timex.h	Sat Dec 8 13:23:50 2001
@@ -43,4 +43,8 @@
 	return val;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* _ASM_TIMEX_H */
diff -urN numa-sched-ref/include/asm-parisc/timex.h numa-sched/include/asm-parisc/timex.h
--- numa-sched-ref/include/asm-parisc/timex.h	Thu Dec 14 22:34:13 2000
+++ numa-sched/include/asm-parisc/timex.h	Sat Dec 8 13:23:50 2001
@@ -18,4 +18,8 @@
 	return mfctl(16);
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-ppc/timex.h numa-sched/include/asm-ppc/timex.h
--- numa-sched-ref/include/asm-ppc/timex.h	Sun Sep 23 21:11:41 2001
+++ numa-sched/include/asm-ppc/timex.h	Sat Dec 8 13:23:50 2001
@@ -45,5 +45,9 @@
 	return ret;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
 #endif /* __KERNEL__ */
diff -urN numa-sched-ref/include/asm-s390/timex.h numa-sched/include/asm-s390/timex.h
--- numa-sched-ref/include/asm-s390/timex.h	Fri May 12 20:41:44 2000
+++ numa-sched/include/asm-s390/timex.h	Sat Dec 8 13:23:50 2001
@@ -26,4 +26,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-s390x/timex.h numa-sched/include/asm-s390x/timex.h
--- numa-sched-ref/include/asm-s390x/timex.h	Thu Feb 22 03:45:11 2001
+++ numa-sched/include/asm-s390x/timex.h	Sat Dec 8 13:23:50 2001
@@ -26,4 +26,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-sh/timex.h numa-sched/include/asm-sh/timex.h
--- numa-sched-ref/include/asm-sh/timex.h	Fri Jan 5 02:19:29 2001
+++ numa-sched/include/asm-sh/timex.h	Sat Dec 8 13:23:50 2001
@@ -21,4 +21,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* __ASM_SH_TIMEX_H */
diff -urN numa-sched-ref/include/asm-sparc/timex.h numa-sched/include/asm-sparc/timex.h
--- numa-sched-ref/include/asm-sparc/timex.h	Thu Mar 11 01:53:37 1999
+++ numa-sched/include/asm-sparc/timex.h	Sat Dec 8 13:23:50 2001
@@ -17,4 +17,8 @@
 extern cycles_t cacheflush_time;
 #define get_cycles() (0)
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa-sched-ref/include/asm-sparc64/timex.h numa-sched/include/asm-sparc64/timex.h
--- numa-sched-ref/include/asm-sparc64/timex.h	Sun Sep 23 21:11:42 2001
+++ numa-sched/include/asm-sparc64/timex.h	Sat Dec 8 13:23:50 2001
@@ -20,4 +20,8 @@
 	ret; \
 })
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
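Every asm-*/timex.h above gains the same three definitions: last_schedule_t, get_last_schedule() and last_schedule_before(). Architectures with a usable cycle counter (i386, ia64, sparc64) keep cycles_t/get_cycles() timestamps, the others fall back to a long read of jiffies, so the scheduler can stamp and compare "when did this CPU last schedule" without knowing which clock backs it. A minimal sketch of how the abstraction is meant to be consumed (pick_longest_idle() is illustrative only, it is not part of the patch):

/*
 * Illustrative only -- not from the patch.  The caller never looks at
 * raw cycles or jiffies: it stamps with get_last_schedule() and orders
 * two stamps with last_schedule_before(), the same way reschedule_idle()
 * uses last_schedule(cpu) further down.
 */
static int pick_longest_idle(last_schedule_t *stamp, int ncpus)
{
	int i, best = 0;

	/* each CPU is assumed to do stamp[cpu] = get_last_schedule() in schedule() */
	for (i = 1; i < ncpus; i++)
		if (last_schedule_before(stamp[i], stamp[best]))
			best = i;	/* older stamp => idle for longer */
	return best;
}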
diff -urN numa-sched-ref/include/linux/numa_sched.h numa-sched/include/linux/numa_sched.h
--- numa-sched-ref/include/linux/numa_sched.h	Thu Jan 1 01:00:00 1970
+++ numa-sched/include/linux/numa_sched.h	Sat Dec 8 13:23:50 2001
@@ -0,0 +1,67 @@
+/*
+ * linux/include/linux/numa_sched.h
+ *
+ * NUMA based scheduler
+ */
+
+#ifndef _LINUX_NUMA_SCHED_H
+#define _LINUX_NUMA_SCHED_H
+
+#ifdef CONFIG_NUMA_SCHED
+#include
+#include
+#include
+#include
+
+struct numa_per_cpu_schedule_data {
+	struct task_struct * curr;
+	last_schedule_t last_schedule;
+	long quiescent;
+};
+
+struct numa_schedule_data {
+	struct numa_per_cpu_schedule_data per_cpu[NR_CPUS] ____cacheline_aligned;
+	struct list_head runqueue_head;
+	int nr_running, nr_threads;
+};
+
+#include
+
+#define numa_nr_running_inc() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_running++; } while(0)
+#define numa_nr_running_dec() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_running--; } while(0)
+#define numa_nr_running(nid) (NODE_SCHEDULE_DATA(nid)->nr_running)
+
+#define numa_nr_threads_inc() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_threads++; } while(0)
+#define numa_nr_threads_dec() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_threads--; } while(0)
+#define numa_nr_threads(nid) (NODE_SCHEDULE_DATA(nid)->nr_threads)
+
+#define cpu_curr(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].curr)
+#define last_schedule(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].last_schedule)
+#define RCU_quiescent(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].quiescent)
+
+#define numa_runqueue_head(x) (&NODE_SCHEDULE_DATA(x)->runqueue_head)
+
+#else /* CONFIG_NUMA_SCHED */
+
+#define numa_nr_running_inc() do { } while(0)
+#define numa_nr_running_dec() do { } while(0)
+#define numa_nr_threads_inc() do { } while(0)
+#define numa_nr_threads_dec() do { } while(0)
+
+/* per-cpu schedule data */
+typedef struct schedule_data_s {
+	struct task_struct * curr;
+	last_schedule_t last_schedule;
+	long quiescent;
+} schedule_data_t ____cacheline_aligned;
+
+extern schedule_data_t schedule_data[NR_CPUS];
+
+#define cpu_curr(cpu) (schedule_data[(cpu)].curr)
+#define last_schedule(cpu) (schedule_data[(cpu)].last_schedule)
+#define RCU_quiescent(cpu) (schedule_data[(cpu)].quiescent)
+
+#define numa_runqueue_head(x) (&runqueue_head)
+#endif /* CONFIG_NUMA_SCHED */
+
+#endif /* __ALPHA_NUMA_SCHED_H */
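The new header is the core of the patch. With CONFIG_NUMA_SCHED every node carries its own runqueue_head plus an array of per-CPU {curr, last_schedule, quiescent} slots inside struct numa_schedule_data, reached through NODE_SCHEDULE_DATA() and cputonode(), which the architecture's NUMA support is expected to provide; without it, the old flat schedule_data[NR_CPUS] array and the single global runqueue are kept behind the same cpu_curr()/last_schedule()/numa_runqueue_head() accessors. A rough sketch of how the per-node counters can be consumed (node_load() and least_loaded_node() are illustrative helpers, not part of the patch):

/*
 * Illustrative only -- assumes CONFIG_NUMA_SCHED and the numnodes /
 * numa_node_id() definitions from the arch NUMA support.  A crude
 * per-node load metric built purely on the accessors exported above.
 */
static inline int node_load(int nid)
{
	return numa_nr_running(nid);	/* runnable tasks queued on that node */
}

static int least_loaded_node(void)
{
	int nid, best = 0;

	for (nid = 1; nid < numnodes; nid++)
		if (node_load(nid) < node_load(best))
			best = nid;
	return best;
}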
diff -urN numa-sched-ref/include/linux/sched.h numa-sched/include/linux/sched.h
--- numa-sched-ref/include/linux/sched.h	Sat Dec 8 13:23:26 2001
+++ numa-sched/include/linux/sched.h	Sat Dec 8 13:24:16 2001
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <linux/numa_sched.h>
 
 struct exec_domain;
 
@@ -302,13 +303,13 @@
 	 * all fields in a single cacheline that are needed for
 	 * the goodness() loop in schedule().
 	 */
-	long counter;
-	long nice;
-	unsigned long policy;
+	int counter;
+	int nice;
+	unsigned int policy;
 	struct mm_struct *mm;
 	int processor;
 	/*
-	 * cpus_runnable is ~0 if the process is not running on any
+	 * cpus_runnable is ~0UL if the process is not running on any
 	 * CPU. It's (1 << cpu) if it's running on a CPU. This mask
 	 * is updated under the runqueue lock.
 	 *
@@ -321,8 +322,9 @@
 	 * that's just fine.)
 	 */
 	struct list_head run_list;
-	unsigned long sleep_time;
-
+#ifdef CONFIG_NUMA_SCHED
+	int nid;
+#endif
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
 	struct list_head local_pages;
@@ -474,8 +476,8 @@
 	policy: SCHED_OTHER, \
 	mm: NULL, \
 	active_mm: &init_mm, \
-	cpus_runnable: -1, \
-	cpus_allowed: -1, \
+	cpus_runnable: -1UL, \
+	cpus_allowed: -1UL, \
 	run_list: LIST_HEAD_INIT(tsk.run_list), \
 	next_task: &tsk, \
 	prev_task: &tsk, \
@@ -519,18 +521,6 @@
 extern struct mm_struct init_mm;
 extern struct task_struct *init_tasks[NR_CPUS];
 
-/* per-cpu schedule data */
-typedef struct schedule_data_s {
-	struct task_struct * curr;
-	cycles_t last_schedule;
-	long quiescent;
-} schedule_data_t ____cacheline_aligned;
-
-extern schedule_data_t schedule_data[NR_CPUS];
-#define cpu_curr(cpu) (schedule_data[(cpu)].curr)
-#define last_schedule(cpu) (schedule_data[(cpu)].last_schedule)
-#define RCU_quiescent(cpu) (schedule_data[(cpu)].quiescent)
-
 /* PID hashing. (shouldnt this be dynamic?) */
 #define PIDHASH_SZ (4096 >> 2)
 extern struct task_struct *pidhash[PIDHASH_SZ];
@@ -807,6 +797,30 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+#define nr_running_inc() \
+do { \
+	numa_nr_running_inc(); \
+	nr_running++; \
+} while (0)
+
+#define nr_running_dec() \
+do { \
+	numa_nr_running_dec(); \
+	nr_running--; \
+} while (0)
+
+#define nr_threads_inc() \
+do { \
+	numa_nr_threads_inc(); \
+	nr_threads++; \
+} while (0)
+
+#define nr_threads_dec() \
+do { \
+	numa_nr_threads_dec(); \
+	nr_threads--; \
+} while (0)
+
 #define __wait_event(wq, condition) \
 do { \
 	wait_queue_t __wait; \
@@ -887,29 +901,28 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
+#define del_from_runqueue(p) \
+do { \
+	nr_running_dec(); \
+	list_del(&(p)->run_list); \
+	(p)->run_list.next = NULL; \
+} while(0)
 
 static inline int task_on_runqueue(struct task_struct *p)
 {
 	return (p->run_list.next != NULL);
 }
 
-static inline void unhash_process(struct task_struct *p)
-{
-	if (task_on_runqueue(p)) BUG();
-	write_lock_irq(&tasklist_lock);
-	nr_threads--;
-	unhash_pid(p);
-	REMOVE_LINKS(p);
-	list_del(&p->thread_group);
-	write_unlock_irq(&tasklist_lock);
-}
+#define unhash_process(p) \
+do { \
+	if (task_on_runqueue(p)) BUG(); \
+	write_lock_irq(&tasklist_lock); \
+	nr_threads_dec(); \
+	unhash_pid(p); \
+	REMOVE_LINKS(p); \
+	list_del(&(p)->thread_group); \
+	write_unlock_irq(&tasklist_lock); \
+} while(0)
 
 /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
 static inline void task_lock(struct task_struct *p)
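With the sched.h hunks above the per-CPU schedule data no longer lives in sched.h (it moved behind the macros in linux/numa_sched.h), and every direct nr_running++ / nr_threads-- in the tree now goes through nr_running_inc(), nr_threads_dec() and friends, so the global and the per-node counters cannot drift apart. Hand-expanded for illustration only, on a CONFIG_NUMA_SCHED build the new add_to_runqueue() in kernel/sched.c below ends up doing roughly:

/*
 * Illustrative hand expansion, not patch code: add_to_runqueue() with
 * nr_running_inc() expanded.  The caller already holds runqueue_lock.
 */
static inline void add_to_runqueue_expanded(struct task_struct *p)
{
	list_add(&p->run_list, numa_runqueue_head(p->nid));
	NODE_SCHEDULE_DATA(numa_node_id())->nr_running++;	/* numa_nr_running_inc() */
	nr_running++;						/* global count */
}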
diff -urN numa-sched-ref/kernel/fork.c numa-sched/kernel/fork.c
--- numa-sched-ref/kernel/fork.c	Fri Nov 23 08:21:05 2001
+++ numa-sched/kernel/fork.c	Sat Dec 8 13:23:50 2001
@@ -639,7 +639,6 @@
 {
 	int i;
 	p->cpus_runnable = ~0UL;
-	p->processor = current->processor;
 	/* ?? should we just memset this ?? */
 	for(i = 0; i < smp_num_cpus; i++)
 		p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
@@ -716,7 +715,7 @@
 
 	SET_LINKS(p);
 	hash_pid(p);
-	nr_threads++;
+	nr_threads_inc();
 	write_unlock_irq(&tasklist_lock);
 
 	if (p->ptrace & PT_PTRACED)
diff -urN numa-sched-ref/kernel/sched.c numa-sched/kernel/sched.c
--- numa-sched-ref/kernel/sched.c	Sat Dec 8 13:23:26 2001
+++ numa-sched/kernel/sched.c	Sat Dec 8 13:32:19 2001
@@ -10,6 +10,7 @@
  *  1998-11-19  Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
+ *  2001-01-29  first NUMA scheduler attempt by Andrea Arcangeli, SuSE
  */
 
 /*
@@ -92,6 +93,8 @@
 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
 
+#ifndef CONFIG_NUMA_SCHED
+
 static LIST_HEAD(runqueue_head);
 
 /*
@@ -100,14 +103,31 @@
  */
 schedule_data_t schedule_data[NR_CPUS] __cacheline_aligned =
 		{{&init_task,0}};
 
+#define init_numa_schedule_data() do { } while(0)
+
+#else /* CONFIG_NUMA_SCHED */
+
+static void __init init_numa_schedule_data(void)
+{
+	int i;
+
+	for (i = 0; i < numnodes; i++) {
+		INIT_LIST_HEAD(&NODE_SCHEDULE_DATA(i)->runqueue_head);
+		NODE_SCHEDULE_DATA(i)->nr_running = 0;
+		NODE_SCHEDULE_DATA(i)->nr_threads = 0;
+	}
+}
+#endif /* CONFIG_NUMA_SCHED */
+
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
 
 #ifdef CONFIG_SMP
 
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
+#define logical_idle_task(cpu) (init_tasks[cpu])
 #define can_schedule(p,cpu) \
-	((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
+	((p)->cpus_runnable & (p)->cpus_allowed & (1UL << cpu))
 
 #else
@@ -205,8 +225,8 @@
 #ifdef CONFIG_SMP
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
+	int cpu, best_cpu, i, max_prio, found_idle;
+	last_schedule_t oldest_idle;
 
 	/*
 	 * shortcut if the woken up task's last CPU is
@@ -214,17 +234,17 @@
 	 */
 	best_cpu = p->processor;
 	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
+		target_tsk = idle_task(best_cpu);
+		if (cpu_curr(best_cpu) == target_tsk) {
+			long need_resched;
 send_now_idle:
 			/*
 			 * If need_resched == -1 then we can skip sending
 			 * the IPI altogether, tsk->need_resched is
 			 * actively watched by the idle thread.
 			 */
-			need_resched = tsk->need_resched;
-			tsk->need_resched = 1;
+			need_resched = target_tsk->need_resched;
+			target_tsk->need_resched = 1;
 			if ((best_cpu != this_cpu) && !need_resched)
 				smp_send_reschedule(best_cpu);
 			return;
@@ -238,13 +258,17 @@
 	 * one will have the least active cache context.) Also find
 	 * the executing process which has the least priority.
 	 */
-	oldest_idle = (cycles_t) -1;
 	target_tsk = NULL;
 	max_prio = 0;
+	found_idle = 0;
 
 	for (i = 0; i < smp_num_cpus; i++) {
 		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
+		if (
+#ifdef CONFIG_NUMA_SCHED
+		    cputonode(cpu) != p->nid ||
+#endif
+		    !can_schedule(p, cpu))
 			continue;
 		tsk = cpu_curr(cpu);
 		/*
@@ -252,7 +276,7 @@
 		 * a priority list between idle CPUs, but this is not
 		 * a problem.
 		 */
-		if (tsk == idle_task(cpu)) {
+		if (tsk == logical_idle_task(i)) {
 #if defined(__i386__) && defined(CONFIG_SMP)
 			/*
 			 * Check if two siblings are idle in the same
@@ -263,17 +287,19 @@
 			    idle_task(cpu_sibling_map[cpu])) {
 				oldest_idle = last_schedule(cpu);
 				target_tsk = tsk;
+				found_idle = 1;
 				break;
 			}
 
 		}
 #endif
-			if (last_schedule(cpu) < oldest_idle) {
+			if (!found_idle || last_schedule_before(last_schedule(cpu), oldest_idle)) {
 				oldest_idle = last_schedule(cpu);
 				target_tsk = tsk;
+				found_idle = 1;
 			}
 		} else {
-			if (oldest_idle == -1ULL) {
+			if (!found_idle) {
 				int prio = preemption_goodness(tsk, p, cpu);
 
 				if (prio > max_prio) {
@@ -283,15 +309,33 @@
 			}
 		}
 	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
+
+#ifdef CONFIG_NUMA_SCHED
+	if (!target_tsk)
+		/* Make sure to use the idle cpus in the other nodes */
+		for (i = 0; i < smp_num_cpus; i++) {
+			cpu = cpu_logical_map(i);
+			if (cputonode(cpu) == p->nid || !can_schedule(p, cpu))
+				continue;
+			tsk = cpu_curr(cpu);
+			if (tsk == logical_idle_task(i)) {
+				if (!found_idle || last_schedule_before(last_schedule(cpu), oldest_idle)) {
+					oldest_idle = last_schedule(cpu);
+					target_tsk = tsk;
+					found_idle = 1;
+					target_tsk->nid = cputonode(cpu);
+				}
+			}
 		}
-		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+#endif
+
+	if (target_tsk) {
+		best_cpu = target_tsk->processor;
+		if (found_idle)
+			goto send_now_idle;
+		target_tsk->need_resched = 1;
+		if (best_cpu != this_cpu)
+			smp_send_reschedule(best_cpu);
 	}
 
 	return;
@@ -315,20 +359,20 @@
  */
 static inline void add_to_runqueue(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	list_add(&p->run_list, numa_runqueue_head(p->nid));
+	nr_running_inc();
 }
 
 static inline void move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, numa_runqueue_head(p->nid));
 }
 
 static inline void move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, numa_runqueue_head(p->nid));
 }
 
 /*
@@ -351,9 +395,9 @@
 	p->state = TASK_RUNNING;
 	if (task_on_runqueue(p))
 		goto out;
-	add_to_runqueue(p);
 	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 		reschedule_idle(p);
+	add_to_runqueue(p);
 	success = 1;
 out:
 	spin_unlock_irqrestore(&runqueue_lock, flags);
@@ -539,10 +583,12 @@
  */
 asmlinkage void schedule(void)
 {
-	schedule_data_t * sched_data;
 	struct task_struct *prev, *next, *p;
 	struct list_head *tmp;
 	int this_cpu, c;
+#ifdef CONFIG_NUMA_SCHED
+	int recalculate_all;
+#endif
 
 	spin_lock_prefetch(&runqueue_lock);
 
@@ -559,12 +605,6 @@
 
 	release_kernel_lock(prev, this_cpu);
 
-	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
-	 */
-	sched_data = &schedule_data[this_cpu];
-
 	spin_lock_irq(&runqueue_lock);
 
 	/* move an exhausted RR process to be last.. */
@@ -596,7 +636,7 @@
 	 */
 	next = idle_task(this_cpu);
 	c = -1000;
-	list_for_each(tmp, &runqueue_head) {
+	list_for_each(tmp, numa_runqueue_head(numa_node_id())) {
 		p = list_entry(tmp, struct task_struct, run_list);
 		if (can_schedule(p, this_cpu)) {
 			int weight = goodness(p, this_cpu, prev->active_mm);
@@ -605,14 +645,40 @@
 		}
 	}
 
+#ifdef CONFIG_NUMA_SCHED
+	recalculate_all = 0;
+	if (c < 0) {
+		int nid;
+
+		recalculate_all = 1;
+		for (nid = 0; nid < numnodes; nid++) {
+			if (nid == numa_node_id())
+				continue;
+			list_for_each(tmp, numa_runqueue_head(nid)) {
+				p = list_entry(tmp, struct task_struct, run_list);
+				if (can_schedule(p, this_cpu)) {
+					int weight = goodness(p, this_cpu, prev->active_mm);
+					if (weight > c)
+						c = weight, next = p;
+				}
+			}
+		}
+	}
+#endif
+
 	/* Do we need to re-calculate counters? */
 	if (unlikely(!c)) {
 		struct task_struct *p;
 
 		spin_unlock_irq(&runqueue_lock);
 		read_lock(&tasklist_lock);
-		for_each_task(p)
+		for_each_task(p) {
+#ifdef CONFIG_NUMA_SCHED
+			if (!recalculate_all && p->nid != numa_node_id())
+				continue;
+#endif
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+		}
 		read_unlock(&tasklist_lock);
 		spin_lock_irq(&runqueue_lock);
 		goto repeat_schedule;
@@ -623,8 +689,14 @@
 	 * switching to the next task, save this fact in
 	 * sched_data.
 	 */
-	sched_data->curr = next;
+	cpu_curr(this_cpu) = next;
 	task_set_cpu(next, this_cpu);
+#if defined(CONFIG_SMP) && defined(CONFIG_NUMA_SCHED)
+	if (next != idle_task(this_cpu) && next->nid != numa_node_id()) {
+		next->nid = numa_node_id();
+		move_last_runqueue(next);
+	}
+#endif
 	spin_unlock_irq(&runqueue_lock);
 
 	if (unlikely(prev == next)) {
@@ -641,7 +713,7 @@
 	 * and it's approximate, so we do not have to maintain
 	 * it while holding the runqueue spinlock.
 	 */
-	sched_data->last_schedule = get_cycles();
+	last_schedule(this_cpu) = get_last_schedule();
 
 	/*
 	 * We drop the scheduler lock early (it's a global spinlock),
@@ -1050,7 +1122,7 @@
 	// Subtract non-idle processes running on other CPUs.
 	for (i = 0; i < smp_num_cpus; i++) {
 		int cpu = cpu_logical_map(i);
-		if (cpu_curr(cpu) != idle_task(cpu))
+		if (cpu_curr(cpu) != logical_idle_task(i))
 			nr_pending--;
 	}
 #else
@@ -1303,16 +1375,15 @@
 
 void __init init_idle(void)
 {
-	schedule_data_t * sched_data;
-	sched_data = &schedule_data[smp_processor_id()];
+	int cpu = smp_processor_id();
 
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
 			smp_processor_id(), current->pid);
 		del_from_runqueue(current);
 	}
-	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
+	cpu_curr(cpu) = current;
+	last_schedule(cpu) = get_last_schedule();
 	clear_bit(current->processor, &wait_init_idle);
 }
 
@@ -1343,4 +1414,6 @@
 	 */
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current, cpu);
+
+	init_numa_schedule_data();
 }
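Taken together the sched.c changes make both halves of the 2.4 scheduler node-aware: reschedule_idle() now looks for an idle (or preemptable) CPU on the woken task's own node first and only falls back, under CONFIG_NUMA_SCHED, to idle CPUs on other nodes (migrating p->nid when it does), while schedule() scans only the local node's runqueue and spills over to the other nodes' queues when nothing runnable is found locally. The feature is only wired up for alpha here, through CONFIG_DISCONTIGMEM -> CONFIG_NUMA -> CONFIG_NUMA_SCHED in arch/alpha/config.in. A condensed sketch of the idle-CPU selection, for illustration only (the locking, the preemption-goodness fallback and the i386 HT sibling case of the real reschedule_idle() are left out):

/*
 * Illustrative only -- not the kernel code.  oldest_idle_cpu() scans one
 * set of CPUs for the longest-idle one; the patch effectively runs this
 * twice: first restricted to p->nid, then (CONFIG_NUMA_SCHED only) over
 * the remaining nodes.
 */
static int oldest_idle_cpu(struct task_struct *p, int nid, int local_pass)
{
	int i, cpu, best = -1, found = 0;
	last_schedule_t oldest;

	for (i = 0; i < smp_num_cpus; i++) {
		cpu = cpu_logical_map(i);
		if (local_pass ? cputonode(cpu) != nid : cputonode(cpu) == nid)
			continue;			/* wrong node set for this pass */
		if (!can_schedule(p, cpu))
			continue;
		if (cpu_curr(cpu) != logical_idle_task(i))
			continue;			/* busy, left to goodness preemption */
		if (!found || last_schedule_before(last_schedule(cpu), oldest)) {
			oldest = last_schedule(cpu);	/* longest idle so far */
			best = cpu;
			found = 1;
		}
	}
	return best;	/* -1: no idle CPU in this pass */
}

Wakeup placement then amounts to trying oldest_idle_cpu(p, p->nid, 1) before oldest_idle_cpu(p, p->nid, 0), and only preempting the lowest-goodness CPU of the home node when both passes come back empty.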