From 4595f9620cda8a1e973588e743cf5f8436dd20c6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 10 Jan 2009 21:58:09 -0800 Subject: x86: change flush_tlb_others to take a const struct cpumask Impact: reduce stack usage, use new cpumask API. This is made a little more tricky by uv_flush_tlb_others which actually alters its argument, for an IPI to be sent to the remaining cpus in the mask. I solve this by allocating a cpumask_var_t for this case and falling back to IPI should this fail. To eliminate temporaries in the caller, all flush_tlb_others implementations now do the this-cpu-elimination step themselves. Note also the curious "cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask)" which has been there since pre-git and yet f->flush_cpumask is always zero at this point. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- arch/x86/kernel/tlb_32.c | 67 +++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) (limited to 'arch/x86/kernel/tlb_32.c') diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index ce5054642247..ec53818f4e38 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -20,7 +20,7 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) * Optimizations Manfred Spraul */ -static cpumask_t flush_cpumask; +static cpumask_var_t flush_cpumask; static struct mm_struct *flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); @@ -92,7 +92,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) cpu = get_cpu(); - if (!cpu_isset(cpu, flush_cpumask)) + if (!cpumask_test_cpu(cpu, flush_cpumask)) goto out; /* * This was a BUG() but until someone can quote me the @@ -114,35 +114,22 @@ void smp_invalidate_interrupt(struct pt_regs *regs) } ack_APIC_irq(); smp_mb__before_clear_bit(); - cpu_clear(cpu, flush_cpumask); + cpumask_clear_cpu(cpu, flush_cpumask); smp_mb__after_clear_bit(); out: put_cpu_no_resched(); inc_irq_stat(irq_tlb_count); } -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) { - cpumask_t cpumask = *cpumaskp; - /* - * A couple of (to be removed) sanity checks: - * - * - current CPU must not be in mask * - mask must exist :) */ - BUG_ON(cpus_empty(cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(cpumask_empty(cpumask)); BUG_ON(!mm); -#ifdef CONFIG_HOTPLUG_CPU - /* If a CPU which we ran on has gone down, OK. */ - cpus_and(cpumask, cpumask, cpu_online_map); - if (unlikely(cpus_empty(cpumask))) - return; -#endif - /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. @@ -150,9 +137,17 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, */ spin_lock(&tlbstate_lock); + cpumask_andnot(flush_cpumask, cpumask, cpumask_of(smp_processor_id())); +#ifdef CONFIG_HOTPLUG_CPU + /* If a CPU which we ran on has gone down, OK. */ + cpumask_and(flush_cpumask, flush_cpumask, cpu_online_mask); + if (unlikely(cpumask_empty(flush_cpumask))) { + spin_unlock(&tlbstate_lock); + return; + } +#endif flush_mm = mm; flush_va = va; - cpus_or(flush_cpumask, cpumask, flush_cpumask); /* * Make the above memory operations globally visible before @@ -163,9 +158,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR); - while (!cpus_empty(flush_cpumask)) + while (!cpumask_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ cpu_relax(); @@ -177,25 +172,19 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } void flush_tlb_mm(struct mm_struct *mm) { - cpumask_t cpu_mask; preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); if (current->active_mm == mm) { if (current->mm) @@ -203,8 +192,8 @@ void flush_tlb_mm(struct mm_struct *mm) else leave_mm(smp_processor_id()); } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -212,11 +201,8 @@ void flush_tlb_mm(struct mm_struct *mm) void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); if (current->active_mm == mm) { if (current->mm) @@ -225,9 +211,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) leave_mm(smp_processor_id()); } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, va); preempt_enable(); } EXPORT_SYMBOL(flush_tlb_page); @@ -254,3 +239,9 @@ void reset_lazy_tlbstate(void) per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; } +static int init_flush_cpumask(void) +{ + alloc_cpumask_var(&flush_cpumask, GFP_KERNEL); + return 0; +} +early_initcall(init_flush_cpumask); -- cgit v1.2.3 From 6dbde3530850d4d8bfc1b6bd4006d92786a2787f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Jan 2009 22:15:53 +0900 Subject: percpu: add optimized generic percpu accessors It is an optimization and a cleanup, and adds the following new generic percpu methods: percpu_read() percpu_write() percpu_add() percpu_sub() percpu_and() percpu_or() percpu_xor() and implements support for them on x86. (other architectures will fall back to a default implementation) The advantage is that for example to read a local percpu variable, instead of this sequence: return __get_cpu_var(var); ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx ffffffff8102ca32: 81 ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax We can get a single instruction by using the optimized variants: return percpu_read(var); ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax I also cleaned up the x86-specific APIs and made the x86 code use these new generic percpu primitives. tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out * added percpu_and() for completeness's sake * made generic percpu ops atomic against preemption Signed-off-by: Ingo Molnar Signed-off-by: Tejun Heo --- arch/x86/kernel/tlb_32.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/tlb_32.c') diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index ec53818f4e38..e65449d0f7d9 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock); */ void leave_mm(int cpu) { - BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); - cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); + BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { - if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { + if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -222,7 +222,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } -- cgit v1.2.3 From 9eb912d1aa6b8106e06a73ea6702ec3dab0d6a1a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 19 Jan 2009 00:38:57 +0900 Subject: x86-64: Move TLB state from PDA to per-cpu and consolidate with 32-bit. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/tlb_32.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel/tlb_32.c') diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index e65449d0f7d9..abf0808d6fc4 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -4,8 +4,8 @@ #include -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) - ____cacheline_aligned = { &init_mm, 0, }; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; /* must come after the send_IPI functions above for inlining */ #include @@ -231,14 +231,6 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } -void reset_lazy_tlbstate(void) -{ - int cpu = raw_smp_processor_id(); - - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} - static int init_flush_cpumask(void) { alloc_cpumask_var(&flush_cpumask, GFP_KERNEL); -- cgit v1.2.3 From 6dd01bedee6c3191643db303a1dc530bad56ec55 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 21 Jan 2009 17:26:06 +0900 Subject: x86: prepare for tlb merge Impact: clean up, ipi vector number reordering for x86_32 Make the following changes to prepare for tlb merge. * reorder x86_32 ip vectors * adjust tlb_32.c and tlb_64.c such that their logics coincide exactly - on spurious invalidate ipi, tlb_32 acks the irq - tlb_64 now has proper memory barriers around clearing flush_cpumask (no change in generated code) * unexport flush_tlb_page from tlb_32.c, there's no user * use unsigned int for cpu id * drop unnecessary includes from tlb_64.c Signed-off-by: Tejun Heo --- arch/x86/kernel/tlb_32.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/tlb_32.c') diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index abf0808d6fc4..93fcb05c7d43 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -84,13 +84,15 @@ EXPORT_SYMBOL_GPL(leave_mm); * * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. */ void smp_invalidate_interrupt(struct pt_regs *regs) { - unsigned long cpu; + unsigned int cpu; - cpu = get_cpu(); + cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, flush_cpumask)) goto out; @@ -112,12 +114,11 @@ void smp_invalidate_interrupt(struct pt_regs *regs) } else leave_mm(cpu); } +out: ack_APIC_irq(); smp_mb__before_clear_bit(); cpumask_clear_cpu(cpu, flush_cpumask); smp_mb__after_clear_bit(); -out: - put_cpu_no_resched(); inc_irq_stat(irq_tlb_count); } @@ -215,7 +216,6 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) flush_tlb_others(&mm->cpu_vm_mask, mm, va); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void *info) { -- cgit v1.2.3 From 02cf94c370e0dc9bf408fe45eb86fe9ad58eaf7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 21 Jan 2009 17:26:06 +0900 Subject: x86: make x86_32 use tlb_64.c Impact: less contention when issuing invalidate IPI, cleanup Make x86_32 use the same tlb code as 64bit. The 64bit code uses multiple IPI vectors for tlb shootdown to reduce contention. This patch makes x86_32 allocate the same 8 IPIs as x86_64 and share the code paths. Note that the usage of asmlinkage is inconsistent for x86_32 and 64 and calls for further cleanup. This has been noted with a FIXME comment in tlb_64.c. Signed-off-by: Tejun Heo --- arch/x86/kernel/tlb_32.c | 239 ----------------------------------------------- 1 file changed, 239 deletions(-) delete mode 100644 arch/x86/kernel/tlb_32.c (limited to 'arch/x86/kernel/tlb_32.c') diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c deleted file mode 100644 index 93fcb05c7d43..000000000000 --- a/arch/x86/kernel/tlb_32.c +++ /dev/null @@ -1,239 +0,0 @@ -#include -#include -#include - -#include - -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) - = { &init_mm, 0, }; - -/* must come after the send_IPI functions above for inlining */ -#include - -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul - */ - -static cpumask_var_t flush_cpumask; -static struct mm_struct *flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - * - * We need to reload %cr3 since the page tables may be going - * away from under us.. - */ -void leave_mm(int cpu) -{ - BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK); - cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} -EXPORT_SYMBOL_GPL(leave_mm); - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu_tlbstate to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu_tlbstate[].active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu_tlbstate[].active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -void smp_invalidate_interrupt(struct pt_regs *regs) -{ - unsigned int cpu; - - cpu = smp_processor_id(); - - if (!cpumask_test_cpu(cpu, flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (flush_va == TLB_FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(flush_va); - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - smp_mb__before_clear_bit(); - cpumask_clear_cpu(cpu, flush_cpumask); - smp_mb__after_clear_bit(); - inc_irq_stat(irq_tlb_count); -} - -void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) -{ - /* - * - mask must exist :) - */ - BUG_ON(cpumask_empty(cpumask)); - BUG_ON(!mm); - - /* - * i'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * AK: x86-64 has a faster method that could be ported. - */ - spin_lock(&tlbstate_lock); - - cpumask_andnot(flush_cpumask, cpumask, cpumask_of(smp_processor_id())); -#ifdef CONFIG_HOTPLUG_CPU - /* If a CPU which we ran on has gone down, OK. */ - cpumask_and(flush_cpumask, flush_cpumask, cpu_online_mask); - if (unlikely(cpumask_empty(flush_cpumask))) { - spin_unlock(&tlbstate_lock); - return; - } -#endif - flush_mm = mm; - flush_va = va; - - /* - * Make the above memory operations globally visible before - * sending the IPI. - */ - smp_mb(); - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR); - - while (!cpumask_empty(flush_cpumask)) - /* nothing. lockup detection does not belong here */ - cpu_relax(); - - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); -} - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - - preempt_disable(); - - local_flush_tlb(); - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm(struct mm_struct *mm) -{ - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, va); - preempt_enable(); -} - -static void do_flush_tlb_all(void *info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1); -} - -static int init_flush_cpumask(void) -{ - alloc_cpumask_var(&flush_cpumask, GFP_KERNEL); - return 0; -} -early_initcall(init_flush_cpumask); -- cgit v1.2.3