From e35c76cd47c244eaa7a74adaabde4d0a1cadb907 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 20 Apr 2011 10:15:34 +0200 Subject: [S390] pfault: fix token handling f6649a7e "[S390] cleanup lowcore access from external interrupts" changed handling of external interrupts. Instead of letting the external interrupt handlers accessing the per cpu lowcore the entry code of the kernel reads already all fields that are necessary and passes them to the handlers. The pfault interrupt handler was incorrectly converted. It tries to dereference a value which used to be a pointer to a lowcore field. After the conversion however it is not anymore the pointer to the field but its content. So instead of a dereference only a cast is needed to get the task pointer that caused the pfault. Fixes a NULL pointer dereference and a subsequent kernel crash: Unable to handle kernel pointer dereference at virtual kernel address (null) Oops: 0004 [#1] SMP Modules linked in: nfsd exportfs nfs lockd fscache nfs_acl auth_rpcgss sunrpc loop qeth_l3 qeth vmur ccwgroup ext3 jbd mbcache dm_mod dasd_eckd_mod dasd_diag_mod dasd_mod CPU: 0 Not tainted 2.6.38-2-s390x #1 Process cron (pid: 1106, task: 000000001f962f78, ksp: 000000001fa0f9d0) Krnl PSW : 0404200180000000 000000000002c03e (pfault_interrupt+0xa2/0x138) R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:0 CC:2 PM:0 EA:3 Krnl GPRS: 0000000000000000 0000000000000001 0000000000000000 0000000000000001 000000001f962f78 0000000000518968 0000000090000002 000000001ff03280 0000000000000000 000000000064f000 000000001f962f78 0000000000002603 0000000006002603 0000000000000000 000000001ff7fe68 000000001ff7fe48 Krnl Code: 000000000002c036: 5820d010 l %r2,16(%r13) 000000000002c03a: 1832 lr %r3,%r2 000000000002c03c: 1a31 ar %r3,%r1 >000000000002c03e: ba23d010 cs %r2,%r3,16(%r13) 000000000002c042: a744fffc brc 4,2c03a 000000000002c046: a7290002 lghi %r2,2 000000000002c04a: e320d0000024 stg %r2,0(%r13) 000000000002c050: 07f0 bcr 15,%r0 Call Trace: 
([<000000001f962f78>] 0x1f962f78) [<000000000001acda>] do_extint+0xf6/0x138 [<000000000039b6ca>] ext_no_vtime+0x30/0x34 [<000000007d706e04>] 0x7d706e04 Last Breaking-Event-Address: [<0000000000000000>] 0x0 For stable maintainers: the first kernel which contains this bug is 2.6.37. Reported-by: Stephen Powell Cc: Jonathan Nieder Cc: stable@kernel.org Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 9217e332b118..4cf85fef407c 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -558,9 +558,9 @@ static void pfault_interrupt(unsigned int ext_int_code, * Get the token (= address of the task structure of the affected task). */ #ifdef CONFIG_64BIT - tsk = *(struct task_struct **) param64; + tsk = (struct task_struct *) param64; #else - tsk = *(struct task_struct **) param32; + tsk = (struct task_struct *) param32; #endif if (subcode & 0x0080) { -- cgit v1.2.3 From a9851832857dc1e4efefca1713f5cff3e168a25c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 29 Apr 2011 10:42:19 +0200 Subject: [S390] irqstats: fix counting of pfault, dasd diag and virtio irqs pfault, dasd diag and virtio all use the same external interrupt number. The respective interrupt handlers decide by the subcode if they are meant to handle the interrupt. Counting is currently done before looking at the subcode which means each handler counts an interrupt even if it is not handling it. Fix this by moving the kstat code after the code which looks at the subcode. 
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 4cf85fef407c..ab988135e5c6 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -543,7 +543,6 @@ static void pfault_interrupt(unsigned int ext_int_code, struct task_struct *tsk; __u16 subcode; - kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; /* * Get the external interruption subcode & pfault * initial/completion signal bit. VM stores this @@ -553,6 +552,7 @@ static void pfault_interrupt(unsigned int ext_int_code, subcode = ext_int_code >> 16; if ((subcode & 0xff00) != __SUBCODE_MASK) return; + kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; /* * Get the token (= address of the task structure of the affected task). -- cgit v1.2.3 From 043d07084b5347a26eab0a07aa13a4a929ad9e71 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 23 May 2011 10:24:23 +0200 Subject: [S390] Remove data execution protection The noexec support on s390 does not rely on a bit in the page table entry but utilizes the secondary space mode to distinguish between memory accesses for instructions vs. data. The noexec code relies on the assumption that the cpu will always use the secondary space page table for data accesses while it is running in the secondary space mode. Up to the z9-109 class machines this has been the case. Unfortunately this is not true anymore with z10 and later machines. The load-relative-long instructions lrl, lgrl and lgfrl access the memory operand using the same addressing-space mode that has been used to fetch the instruction. This breaks the noexec mode for all user space binaries compiled with march=z10 or later. The only option is to remove the current noexec support. 
Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ab988135e5c6..177745c520ca 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -225,33 +225,6 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code, force_sig_info(SIGBUS, &si, tsk); } -#ifdef CONFIG_S390_EXEC_PROTECT -static noinline int signal_return(struct pt_regs *regs, long int_code, - unsigned long trans_exc_code) -{ - u16 instruction; - int rc; - - rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - - if (!rc && instruction == 0x0a77) { - clear_tsk_thread_flag(current, TIF_PER_TRAP); - if (is_compat_task()) - sys32_sigreturn(); - else - sys_sigreturn(); - } else if (!rc && instruction == 0x0aad) { - clear_tsk_thread_flag(current, TIF_PER_TRAP); - if (is_compat_task()) - sys32_rt_sigreturn(); - else - sys_rt_sigreturn(); - } else - do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); - return 0; -} -#endif /* CONFIG_S390_EXEC_PROTECT */ - static noinline void do_fault_error(struct pt_regs *regs, long int_code, unsigned long trans_exc_code, int fault) { @@ -259,13 +232,6 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code, switch (fault) { case VM_FAULT_BADACCESS: -#ifdef CONFIG_S390_EXEC_PROTECT - if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && - (trans_exc_code & 3) == 0) { - signal_return(regs, int_code, trans_exc_code); - break; - } -#endif /* CONFIG_S390_EXEC_PROTECT */ case VM_FAULT_BADMAP: /* Bad memory access. Check if it is kernel or user space. 
*/ if (regs->psw.mask & PSW_MASK_PSTATE) { @@ -414,11 +380,6 @@ void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code, int access, fault; access = VM_READ | VM_EXEC | VM_WRITE; -#ifdef CONFIG_S390_EXEC_PROTECT - if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && - (trans_exc_code & 3) == 0) - access = VM_EXEC; -#endif fault = do_exception(regs, access, trans_exc_code); if (unlikely(fault)) do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault); -- cgit v1.2.3 From f2db2e6cb3f5f766cbb3788af44705685ff2445a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 23 May 2011 10:24:34 +0200 Subject: [S390] pfault: cpu hotplug vs missing completion interrupts On cpu hot remove a PFAULT CANCEL command is sent to the hypervisor which in turn will cancel all outstanding pfault requests that have been issued on that cpu (the same happens with a SIGP cpu reset). The result is that we end up with uninterruptible processes where the interrupt that would wake up these processes never arrives. In order to solve this all processes which wait for a pfault completion interrupt get woken up after a cpu hot remove. The worst case that could happen is that they fault again and in turn need to wait again. 
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 89 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 24 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 177745c520ca..1ca656478326 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -466,7 +466,7 @@ typedef struct { int pfault_init(void) { pfault_refbk_t refbk = - { 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, + { 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48, __PF_RES_FIELD }; int rc; @@ -498,11 +498,15 @@ void pfault_fini(void) : : "a" (&refbk), "m" (refbk) : "cc"); } +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + static void pfault_interrupt(unsigned int ext_int_code, unsigned int param32, unsigned long param64) { struct task_struct *tsk; __u16 subcode; + pid_t pid; /* * Get the external interruption subcode & pfault @@ -514,44 +518,79 @@ static void pfault_interrupt(unsigned int ext_int_code, if ((subcode & 0xff00) != __SUBCODE_MASK) return; kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; - - /* - * Get the token (= address of the task structure of the affected task). - */ -#ifdef CONFIG_64BIT - tsk = (struct task_struct *) param64; -#else - tsk = (struct task_struct *) param32; -#endif - + if (subcode & 0x0080) { + /* Get the token (= pid of the affected task). */ + pid = sizeof(void *) == 4 ? param32 : param64; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + } else { + tsk = current; + } + spin_lock(&pfault_lock); if (subcode & 0x0080) { /* signal bit is set -> a page has been swapped in by VM */ - if (xchg(&tsk->thread.pfault_wait, -1) != 0) { + if (tsk->thread.pfault_wait == 1) { /* Initial interrupt was faster than the completion * interrupt. pfault_wait is valid. 
Set pfault_wait * back to zero and wake up the process. This can * safely be done because the task is still sleeping * and can't produce new pfaults. */ tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); wake_up_process(tsk); - put_task_struct(tsk); + } else { + /* Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. */ + tsk->thread.pfault_wait = -1; } + put_task_struct(tsk); } else { /* signal bit not set -> a real page is missing. */ - get_task_struct(tsk); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (xchg(&tsk->thread.pfault_wait, 1) != 0) { + if (tsk->thread.pfault_wait == -1) { /* Completion interrupt was faster than the initial - * interrupt (swapped in a -1 for pfault_wait). Set - * pfault_wait back to zero and exit. This can be - * done safely because tsk is running in kernel - * mode and can't produce new pfaults. */ + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. */ tsk->thread.pfault_wait = 0; - set_task_state(tsk, TASK_RUNNING); - put_task_struct(tsk); - } else + } else { + /* Initial interrupt arrived before completion + * interrupt. Let the task sleep. 
*/ + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); + } + } + spin_unlock(&pfault_lock); +} + +static int __cpuinit pfault_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + } + spin_unlock_irq(&pfault_lock); + break; + default: + break; } + return NOTIFY_OK; } static int __init pfault_irq_init(void) @@ -568,8 +607,10 @@ static int __init pfault_irq_init(void) pfault_disable = 1; return rc; } - if (pfault_init() == 0) + if (pfault_init() == 0) { + hotcpu_notifier(pfault_cpu_notify, 0); return 0; + } /* Tough luck, no pfault. */ pfault_disable = 1; -- cgit v1.2.3 From 7dd8fe1f910f9644167ef91ddab44107d0d668c5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 23 May 2011 10:24:35 +0200 Subject: [S390] pfault: cleanup code Small code cleanup. 
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 1ca656478326..a0f9e730f26a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -452,22 +452,28 @@ static int __init nopfault(char *str) __setup("nopfault", nopfault); -typedef struct { - __u16 refdiagc; - __u16 reffcode; - __u16 refdwlen; - __u16 refversn; - __u64 refgaddr; - __u64 refselmk; - __u64 refcmpmk; - __u64 reserved; -} __attribute__ ((packed, aligned(8))) pfault_refbk_t; +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +} __attribute__ ((packed, aligned(8))); int pfault_init(void) { - pfault_refbk_t refbk = - { 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48, - __PF_RES_FIELD }; + struct pfault_refbk refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_CURRENT_PID, + .refselmk = 1ULL << 48, + .refcmpmk = 1ULL << 48, + .reserved = __PF_RES_FIELD }; int rc; if (!MACHINE_IS_VM || pfault_disable) @@ -485,8 +491,12 @@ int pfault_init(void) void pfault_fini(void) { - pfault_refbk_t refbk = - { 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL }; + struct pfault_refbk refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, + }; if (!MACHINE_IS_VM || pfault_disable) return; @@ -599,24 +609,21 @@ static int __init pfault_irq_init(void) if (!MACHINE_IS_VM) return 0; - /* - * Try to get pfault pseudo page faults going. - */ rc = register_external_interrupt(0x2603, pfault_interrupt); - if (rc) { - pfault_disable = 1; - return rc; - } - if (pfault_init() == 0) { - hotcpu_notifier(pfault_cpu_notify, 0); - return 0; - } + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + hotcpu_notifier(pfault_cpu_notify, 0); + return 0; - /* Tough luck, no pfault. */ - pfault_disable = 1; +out_pfault: unregister_external_interrupt(0x2603, pfault_interrupt); - return 0; +out_extint: + pfault_disable = 1; + return rc; } early_initcall(pfault_irq_init); -#endif +#endif /* CONFIG_PFAULT */ -- cgit v1.2.3 From 902050bcdece6191565c055539e82c5cc534feed Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 26 May 2011 09:48:22 +0200 Subject: [S390] pfault: always enable service signal interrupt Always enable the service signal subclass mask bit in cr0, if pfault is available. That way we use the normal cpu hotplug way to propagate the subclass mask bit in cr0 instead of open coding it. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index a0f9e730f26a..e46ba2927424 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -485,7 +485,6 @@ int pfault_init(void) "2:\n" EX_TABLE(0b,1b) : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); - __ctl_set_bit(0, 9); return rc; } @@ -500,7 +499,6 @@ void pfault_fini(void) if (!MACHINE_IS_VM || pfault_disable) return; - __ctl_clear_bit(0,9); asm volatile( " diag %0,0,0x258\n" "0:\n" @@ -615,6 +613,7 @@ static int __init pfault_irq_init(void) rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; if (rc) goto out_pfault; + ctl_set_bit(0, 9); hotcpu_notifier(pfault_cpu_notify, 0); return 0; -- cgit v1.2.3 From df7997ab1ca82ae3c37a2f5eb98613fc24527f95 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 26 May 2011 09:48:23 +0200 Subject: [S390] irq: fix service signal external interrupt handling Interrupt sources like pfault, sclp, dasd_diag and virtio all use the service signal external interrupt subclass mask in control register 0 to enable and disable the corresponding interrupt. 
Because no reference counting is implemented, each subsystem thinks it is the only user of the subclass and sets and clears the bit as it wants. This leads to the case that unloading the dasd diag module under z/VM causes both sclp and pfault interrupts to be masked. The result will be a locked-up system sooner or later. Fix this by introducing a new way to set (register) and clear (unregister) the service signal subclass mask bit in cr0. Also convert all drivers. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e46ba2927424..6e922b50efa4 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -613,7 +613,7 @@ static int __init pfault_irq_init(void) rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; if (rc) goto out_pfault; - ctl_set_bit(0, 9); + service_subclass_irq_register(); hotcpu_notifier(pfault_cpu_notify, 0); return 0; -- cgit v1.2.3 From d7b250e2a2d7f3cd23cf8d8d6689285e6f51a98d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 26 May 2011 09:48:24 +0200 Subject: [S390] irq: merge irq.c and s390_ext.c Merge irq.c and s390_ext.c into irq.c. That way all external interrupt related functions are together. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6e922b50efa4..105fa1071435 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include "../kernel/entry.h" -- cgit v1.2.3 From 99583181cbf2252dd0554eef6f419a6b22cd33ea Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 26 May 2011 09:48:29 +0200 Subject: [S390] mm: handle kernel caused page fault oom situations If e.g. 
copy_from_user() generates a page fault and the kernel runs into an OOM situation, the system might lock up. If the OOM killer sends a SIGKILL to the current process it can't handle it since it is stuck in a copy_from_user() - page fault loop. Fix this by adding the same fix as other architectures have. E.g. the x86 variant f86268 "x86/mm: Handle mm_fault_error() in kernel space" Signed-off-by: Heiko Carstens --- arch/s390/mm/fault.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 105fa1071435..b57723aee848 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -245,9 +245,12 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code, do_no_context(regs, int_code, trans_exc_code); break; default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) - pagefault_out_of_memory(); - else if (fault & VM_FAULT_SIGBUS) { + if (fault & VM_FAULT_OOM) { + if (!(regs->psw.mask & PSW_MASK_PSTATE)) + do_no_context(regs, int_code, trans_exc_code); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGBUS) { /* Kernel mode? Handle exceptions or die */ if (!(regs->psw.mask & PSW_MASK_PSTATE)) do_no_context(regs, int_code, trans_exc_code); @@ -429,10 +432,9 @@ int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) access = write ? VM_WRITE : VM_READ; fault = do_exception(&regs, access, uaddr | 2); if (unlikely(fault)) { - if (fault & VM_FAULT_OOM) { - pagefault_out_of_memory(); - fault = 0; - } else if (fault & VM_FAULT_SIGBUS) + if (fault & VM_FAULT_OOM) + return -EFAULT; + else if (fault & VM_FAULT_SIGBUS) do_sigbus(&regs, pgm_int_code, uaddr); } return fault ? 
-EFAULT : 0; -- cgit v1.2.3 From 33ce614029576b8585e271fd7d90746a37114a15 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 26 May 2011 09:48:30 +0200 Subject: [S390] mm: add page fault retry handling s390 arch backend for d065bd81 "mm: retry page fault when blocking on disk transfer". Signed-off-by: Heiko Carstens --- arch/s390/mm/fault.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'arch/s390/mm/fault.c') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index b57723aee848..fe103e891e7a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -280,7 +280,8 @@ static inline int do_exception(struct pt_regs *regs, int access, struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int fault, write; + unsigned int flags; + int fault; if (notify_page_fault(regs)) return 0; @@ -299,6 +300,10 @@ static inline int do_exception(struct pt_regs *regs, int access, address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + flags = FAULT_FLAG_ALLOW_RETRY; + if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + flags |= FAULT_FLAG_WRITE; +retry: down_read(&mm->mmap_sem); fault = VM_FAULT_BADMAP; @@ -328,21 +333,31 @@ static inline int do_exception(struct pt_regs *regs, int access, * make sure we exit gracefully rather than endlessly redo * the fault. */ - write = (access == VM_WRITE || - (trans_exc_code & store_indication) == 0x400) ? 
- FAULT_FLAG_WRITE : 0; - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } /* * The instruction that caused the program check will -- cgit v1.2.3