From e35c76cd47c244eaa7a74adaabde4d0a1cadb907 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 20 Apr 2011 10:15:34 +0200
Subject: [S390] pfault: fix token handling

f6649a7e "[S390] cleanup lowcore access from external interrupts" changed
handling of external interrupts. Instead of letting the external interrupt
handlers accessing the per cpu lowcore the entry code of the kernel reads
already all fields that are necessary and passes them to the handlers.
The pfault interrupt handler was incorrectly converted. It tries to
dereference a value which used to be a pointer to a lowcore field. After
the conversion however it is not anymore the pointer to the field but its
content. So instead of a dereference only a cast is needed to get the
task pointer that caused the pfault.

Fixes a NULL pointer dereference and a subsequent kernel crash:

Unable to handle kernel pointer dereference at virtual kernel address (null)
Oops: 0004 [#1] SMP
Modules linked in: nfsd exportfs nfs lockd fscache nfs_acl auth_rpcgss sunrpc
                   loop qeth_l3 qeth vmur ccwgroup ext3 jbd mbcache dm_mod
                   dasd_eckd_mod dasd_diag_mod dasd_mod
CPU: 0 Not tainted 2.6.38-2-s390x #1
Process cron (pid: 1106, task: 000000001f962f78, ksp: 000000001fa0f9d0)
Krnl PSW : 0404200180000000 000000000002c03e (pfault_interrupt+0xa2/0x138)
           R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:0 CC:2 PM:0 EA:3
Krnl GPRS: 0000000000000000 0000000000000001 0000000000000000 0000000000000001
           000000001f962f78 0000000000518968 0000000090000002 000000001ff03280
           0000000000000000 000000000064f000 000000001f962f78 0000000000002603
           0000000006002603 0000000000000000 000000001ff7fe68 000000001ff7fe48
Krnl Code: 000000000002c036: 5820d010            l       %r2,16(%r13)
           000000000002c03a: 1832                lr      %r3,%r2
           000000000002c03c: 1a31                ar      %r3,%r1
          >000000000002c03e: ba23d010            cs      %r2,%r3,16(%r13)
           000000000002c042: a744fffc            brc     4,2c03a
           000000000002c046: a7290002            lghi    %r2,2
           000000000002c04a: e320d0000024        stg     %r2,0(%r13)
           000000000002c050: 07f0                bcr     15,%r0
Call Trace:
 ([<000000001f962f78>] 0x1f962f78)
  [<000000000001acda>] do_extint+0xf6/0x138
  [<000000000039b6ca>] ext_no_vtime+0x30/0x34
  [<000000007d706e04>] 0x7d706e04
Last Breaking-Event-Address:
  [<0000000000000000>] 0x0

For stable maintainers:
the first kernel which contains this bug is 2.6.37.

Reported-by: Stephen Powell <zlinuxman@wowway.com>
Cc: Jonathan Nieder <jrnieder@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 9217e332b118..4cf85fef407c 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -558,9 +558,9 @@ static void pfault_interrupt(unsigned int ext_int_code,
 	 * Get the token (= address of the task structure of the affected task).
 	 */
 #ifdef CONFIG_64BIT
-	tsk = *(struct task_struct **) param64;
+	tsk = (struct task_struct *) param64;
 #else
-	tsk = *(struct task_struct **) param32;
+	tsk = (struct task_struct *) param32;
 #endif
 
 	if (subcode & 0x0080) {
-- 
cgit v1.2.3


From a9851832857dc1e4efefca1713f5cff3e168a25c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 29 Apr 2011 10:42:19 +0200
Subject: [S390] irqstats: fix counting of pfault, dasd diag and virtio irqs

pfault, dasd diag and virtio all use the same external interrupt number.
The respective interrupt handlers decide by the subcode if they are
meant to handle the interrupt.
Counting is currently done before looking at the subcode which means
each handler counts an interrupt even if it is not handling it.
Fix this by moving the kstat code after the code which looks at the
subcode.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 4cf85fef407c..ab988135e5c6 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -543,7 +543,6 @@ static void pfault_interrupt(unsigned int ext_int_code,
 	struct task_struct *tsk;
 	__u16 subcode;
 
-	kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
 	/*
 	 * Get the external interruption subcode & pfault
 	 * initial/completion signal bit. VM stores this 
@@ -553,6 +552,7 @@ static void pfault_interrupt(unsigned int ext_int_code,
 	subcode = ext_int_code >> 16;
 	if ((subcode & 0xff00) != __SUBCODE_MASK)
 		return;
+	kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
 
 	/*
 	 * Get the token (= address of the task structure of the affected task).
-- 
cgit v1.2.3


From 043d07084b5347a26eab0a07aa13a4a929ad9e71 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 23 May 2011 10:24:23 +0200
Subject: [S390] Remove data execution protection

The noexec support on s390 does not rely on a bit in the page table
entry but utilizes the secondary space mode to distinguish between
memory accesses for instructions vs. data. The noexec code relies
on the assumption that the cpu will always use the secondary space
page table for data accesses while it is running in the secondary
space mode. Up to the z9-109 class machines this has been the case.
Unfortunately this is not true anymore with z10 and later machines.
The load-relative-long instructions lrl, lgrl and lgfrl access the
memory operand using the same addressing-space mode that has been
used to fetch the instruction.
This breaks the noexec mode for all user space binaries compiled
with march=z10 or later. The only option is to remove the current
noexec support.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index ab988135e5c6..177745c520ca 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -225,33 +225,6 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code,
 	force_sig_info(SIGBUS, &si, tsk);
 }
 
-#ifdef CONFIG_S390_EXEC_PROTECT
-static noinline int signal_return(struct pt_regs *regs, long int_code,
-				  unsigned long trans_exc_code)
-{
-	u16 instruction;
-	int rc;
-
-	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
-
-	if (!rc && instruction == 0x0a77) {
-		clear_tsk_thread_flag(current, TIF_PER_TRAP);
-		if (is_compat_task())
-			sys32_sigreturn();
-		else
-			sys_sigreturn();
-	} else if (!rc && instruction == 0x0aad) {
-		clear_tsk_thread_flag(current, TIF_PER_TRAP);
-		if (is_compat_task())
-			sys32_rt_sigreturn();
-		else
-			sys_rt_sigreturn();
-	} else
-		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
-	return 0;
-}
-#endif /* CONFIG_S390_EXEC_PROTECT */
-
 static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 				    unsigned long trans_exc_code, int fault)
 {
@@ -259,13 +232,6 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 
 	switch (fault) {
 	case VM_FAULT_BADACCESS:
-#ifdef CONFIG_S390_EXEC_PROTECT
-		if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-		    (trans_exc_code & 3) == 0) {
-			signal_return(regs, int_code, trans_exc_code);
-			break;
-		}
-#endif /* CONFIG_S390_EXEC_PROTECT */
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (regs->psw.mask & PSW_MASK_PSTATE) {
@@ -414,11 +380,6 @@ void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code,
 	int access, fault;
 
 	access = VM_READ | VM_EXEC | VM_WRITE;
-#ifdef CONFIG_S390_EXEC_PROTECT
-	if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-	    (trans_exc_code & 3) == 0)
-		access = VM_EXEC;
-#endif
 	fault = do_exception(regs, access, trans_exc_code);
 	if (unlikely(fault))
 		do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault);
-- 
cgit v1.2.3


From f2db2e6cb3f5f766cbb3788af44705685ff2445a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 23 May 2011 10:24:34 +0200
Subject: [S390] pfault: cpu hotplug vs missing completion interrupts

On cpu hot remove a PFAULT CANCEL command is sent to the hypervisor
which in turn will cancel all outstanding pfault requests that have
been issued on that cpu (the same happens with a SIGP cpu reset).

The result is that we end up with uninterruptible processes where
the interrupt that would wake up these processes never arrives.

In order to solve this all processes which wait for a pfault
completion interrupt get woken up after a cpu hot remove. The worst
case that could happen is that they fault again and in turn need to
wait again.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 89 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 24 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 177745c520ca..1ca656478326 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -466,7 +466,7 @@ typedef struct {
 int pfault_init(void)
 {
 	pfault_refbk_t refbk =
-		{ 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48,
+		{ 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48,
 		  __PF_RES_FIELD };
         int rc;
 
@@ -498,11 +498,15 @@ void pfault_fini(void)
 		: : "a" (&refbk), "m" (refbk) : "cc");
 }
 
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
 static void pfault_interrupt(unsigned int ext_int_code,
 			     unsigned int param32, unsigned long param64)
 {
 	struct task_struct *tsk;
 	__u16 subcode;
+	pid_t pid;
 
 	/*
 	 * Get the external interruption subcode & pfault
@@ -514,44 +518,79 @@ static void pfault_interrupt(unsigned int ext_int_code,
 	if ((subcode & 0xff00) != __SUBCODE_MASK)
 		return;
 	kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
-
-	/*
-	 * Get the token (= address of the task structure of the affected task).
-	 */
-#ifdef CONFIG_64BIT
-	tsk = (struct task_struct *) param64;
-#else
-	tsk = (struct task_struct *) param32;
-#endif
-
+	if (subcode & 0x0080) {
+		/* Get the token (= pid of the affected task). */
+		pid = sizeof(void *) == 4 ? param32 : param64;
+		rcu_read_lock();
+		tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+		if (tsk)
+			get_task_struct(tsk);
+		rcu_read_unlock();
+		if (!tsk)
+			return;
+	} else {
+		tsk = current;
+	}
+	spin_lock(&pfault_lock);
 	if (subcode & 0x0080) {
 		/* signal bit is set -> a page has been swapped in by VM */
-		if (xchg(&tsk->thread.pfault_wait, -1) != 0) {
+		if (tsk->thread.pfault_wait == 1) {
 			/* Initial interrupt was faster than the completion
 			 * interrupt. pfault_wait is valid. Set pfault_wait
 			 * back to zero and wake up the process. This can
 			 * safely be done because the task is still sleeping
 			 * and can't produce new pfaults. */
 			tsk->thread.pfault_wait = 0;
+			list_del(&tsk->thread.list);
 			wake_up_process(tsk);
-			put_task_struct(tsk);
+		} else {
+			/* Completion interrupt was faster than initial
+			 * interrupt. Set pfault_wait to -1 so the initial
+			 * interrupt doesn't put the task to sleep. */
+			tsk->thread.pfault_wait = -1;
 		}
+		put_task_struct(tsk);
 	} else {
 		/* signal bit not set -> a real page is missing. */
-		get_task_struct(tsk);
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-		if (xchg(&tsk->thread.pfault_wait, 1) != 0) {
+		if (tsk->thread.pfault_wait == -1) {
 			/* Completion interrupt was faster than the initial
-			 * interrupt (swapped in a -1 for pfault_wait). Set
-			 * pfault_wait back to zero and exit. This can be
-			 * done safely because tsk is running in kernel 
-			 * mode and can't produce new pfaults. */
+			 * interrupt (pfault_wait == -1). Set pfault_wait
+			 * back to zero and exit. */
 			tsk->thread.pfault_wait = 0;
-			set_task_state(tsk, TASK_RUNNING);
-			put_task_struct(tsk);
-		} else
+		} else {
+			/* Initial interrupt arrived before completion
+			 * interrupt. Let the task sleep. */
+			tsk->thread.pfault_wait = 1;
+			list_add(&tsk->thread.list, &pfault_list);
+			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 			set_tsk_need_resched(tsk);
+		}
+	}
+	spin_unlock(&pfault_lock);
+}
+
+static int __cpuinit pfault_cpu_notify(struct notifier_block *self,
+				       unsigned long action, void *hcpu)
+{
+	struct thread_struct *thread, *next;
+	struct task_struct *tsk;
+
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		spin_lock_irq(&pfault_lock);
+		list_for_each_entry_safe(thread, next, &pfault_list, list) {
+			thread->pfault_wait = 0;
+			list_del(&thread->list);
+			tsk = container_of(thread, struct task_struct, thread);
+			wake_up_process(tsk);
+		}
+		spin_unlock_irq(&pfault_lock);
+		break;
+	default:
+		break;
 	}
+	return NOTIFY_OK;
 }
 
 static int __init pfault_irq_init(void)
@@ -568,8 +607,10 @@ static int __init pfault_irq_init(void)
 		pfault_disable = 1;
 		return rc;
 	}
-	if (pfault_init() == 0)
+	if (pfault_init() == 0) {
+		hotcpu_notifier(pfault_cpu_notify, 0);
 		return 0;
+	}
 
 	/* Tough luck, no pfault. */
 	pfault_disable = 1;
-- 
cgit v1.2.3


From 7dd8fe1f910f9644167ef91ddab44107d0d668c5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 23 May 2011 10:24:35 +0200
Subject: [S390] pfault: cleanup code

Small code cleanup.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 67 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 1ca656478326..a0f9e730f26a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -452,22 +452,28 @@ static int __init nopfault(char *str)
 
 __setup("nopfault", nopfault);
 
-typedef struct {
-	__u16 refdiagc;
-	__u16 reffcode;
-	__u16 refdwlen;
-	__u16 refversn;
-	__u64 refgaddr;
-	__u64 refselmk;
-	__u64 refcmpmk;
-	__u64 reserved;
-} __attribute__ ((packed, aligned(8))) pfault_refbk_t;
+struct pfault_refbk {
+	u16 refdiagc;
+	u16 reffcode;
+	u16 refdwlen;
+	u16 refversn;
+	u64 refgaddr;
+	u64 refselmk;
+	u64 refcmpmk;
+	u64 reserved;
+} __attribute__ ((packed, aligned(8)));
 
 int pfault_init(void)
 {
-	pfault_refbk_t refbk =
-		{ 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48,
-		  __PF_RES_FIELD };
+	struct pfault_refbk refbk = {
+		.refdiagc = 0x258,
+		.reffcode = 0,
+		.refdwlen = 5,
+		.refversn = 2,
+		.refgaddr = __LC_CURRENT_PID,
+		.refselmk = 1ULL << 48,
+		.refcmpmk = 1ULL << 48,
+		.reserved = __PF_RES_FIELD };
         int rc;
 
 	if (!MACHINE_IS_VM || pfault_disable)
@@ -485,8 +491,12 @@ int pfault_init(void)
 
 void pfault_fini(void)
 {
-	pfault_refbk_t refbk =
-	{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };
+	struct pfault_refbk refbk = {
+		.refdiagc = 0x258,
+		.reffcode = 1,
+		.refdwlen = 5,
+		.refversn = 2,
+	};
 
 	if (!MACHINE_IS_VM || pfault_disable)
 		return;
@@ -599,24 +609,21 @@ static int __init pfault_irq_init(void)
 
 	if (!MACHINE_IS_VM)
 		return 0;
-	/*
-	 * Try to get pfault pseudo page faults going.
-	 */
 	rc = register_external_interrupt(0x2603, pfault_interrupt);
-	if (rc) {
-		pfault_disable = 1;
-		return rc;
-	}
-	if (pfault_init() == 0) {
-		hotcpu_notifier(pfault_cpu_notify, 0);
-		return 0;
-	}
+	if (rc)
+		goto out_extint;
+	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+	if (rc)
+		goto out_pfault;
+	hotcpu_notifier(pfault_cpu_notify, 0);
+	return 0;
 
-	/* Tough luck, no pfault. */
-	pfault_disable = 1;
+out_pfault:
 	unregister_external_interrupt(0x2603, pfault_interrupt);
-	return 0;
+out_extint:
+	pfault_disable = 1;
+	return rc;
 }
 early_initcall(pfault_irq_init);
 
-#endif
+#endif /* CONFIG_PFAULT */
-- 
cgit v1.2.3


From 902050bcdece6191565c055539e82c5cc534feed Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 26 May 2011 09:48:22 +0200
Subject: [S390] pfault: always enable service signal interrupt

Always enable the service signal subclass mask bit in cr0, if pfault
is available. That way we use the normal cpu hotplug way to propagate
the subclass mask bit in cr0 instead of open coding it.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index a0f9e730f26a..e46ba2927424 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -485,7 +485,6 @@ int pfault_init(void)
 		"2:\n"
 		EX_TABLE(0b,1b)
 		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
-        __ctl_set_bit(0, 9);
         return rc;
 }
 
@@ -500,7 +499,6 @@ void pfault_fini(void)
 
 	if (!MACHINE_IS_VM || pfault_disable)
 		return;
-	__ctl_clear_bit(0,9);
 	asm volatile(
 		"	diag	%0,0,0x258\n"
 		"0:\n"
@@ -615,6 +613,7 @@ static int __init pfault_irq_init(void)
 	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
 	if (rc)
 		goto out_pfault;
+	ctl_set_bit(0, 9);
 	hotcpu_notifier(pfault_cpu_notify, 0);
 	return 0;
 
-- 
cgit v1.2.3


From df7997ab1ca82ae3c37a2f5eb98613fc24527f95 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 26 May 2011 09:48:23 +0200
Subject: [S390] irq: fix service signal external interrupt handling

Interrupt sources like pfault, sclp, dasd_diag and virtio all use the
service signal external interrupt subclass mask in control register 0
to enable and disable the corresponding interrupt.
Because no reference counting is implemented each subsystem thinks it
is the only user of subclass and sets and clears the bit like it wants.
This leads to case that unloading the dasd diag module under z/VM
causes both sclp and pfault interrupts to be masked. The result will
be locked up system sooner or later.
Fix this by introducing a new way to set (register) and clear
(unregister) the service signal subclass mask bit in cr0.
Also convert all drivers.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e46ba2927424..6e922b50efa4 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -613,7 +613,7 @@ static int __init pfault_irq_init(void)
 	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
 	if (rc)
 		goto out_pfault;
-	ctl_set_bit(0, 9);
+	service_subclass_irq_register();
 	hotcpu_notifier(pfault_cpu_notify, 0);
 	return 0;
 
-- 
cgit v1.2.3


From d7b250e2a2d7f3cd23cf8d8d6689285e6f51a98d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 26 May 2011 09:48:24 +0200
Subject: [S390] irq: merge irq.c and s390_ext.c

Merge irq.c and s390_ext.c into irq.c. That way all external interrupt
related functions are together.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 6e922b50efa4..105fa1071435 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -34,7 +34,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/system.h>
 #include <asm/pgtable.h>
-#include <asm/s390_ext.h>
+#include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/compat.h>
 #include "../kernel/entry.h"
-- 
cgit v1.2.3


From 99583181cbf2252dd0554eef6f419a6b22cd33ea Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 26 May 2011 09:48:29 +0200
Subject: [S390] mm: handle kernel caused page fault oom situations

If e.g. copy_from_user() generates a page fault and the kernel runs
into an OOM situation the system might lock up.
If the OOM killer sends a SIG_KILL to the current process it can't
handle it since it is stuck in a copy_from_user() - page fault loop.

Fix this by adding the same fix as other architectures have.

E.g. the x86 variant f86268 "x86/mm: Handle mm_fault_error() in kernel
space"

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/mm/fault.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 105fa1071435..b57723aee848 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -245,9 +245,12 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 		do_no_context(regs, int_code, trans_exc_code);
 		break;
 	default: /* fault & VM_FAULT_ERROR */
-		if (fault & VM_FAULT_OOM)
-			pagefault_out_of_memory();
-		else if (fault & VM_FAULT_SIGBUS) {
+		if (fault & VM_FAULT_OOM) {
+			if (!(regs->psw.mask & PSW_MASK_PSTATE))
+				do_no_context(regs, int_code, trans_exc_code);
+			else
+				pagefault_out_of_memory();
+		} else if (fault & VM_FAULT_SIGBUS) {
 			/* Kernel mode? Handle exceptions or die */
 			if (!(regs->psw.mask & PSW_MASK_PSTATE))
 				do_no_context(regs, int_code, trans_exc_code);
@@ -429,10 +432,9 @@ int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
 	access = write ? VM_WRITE : VM_READ;
 	fault = do_exception(&regs, access, uaddr | 2);
 	if (unlikely(fault)) {
-		if (fault & VM_FAULT_OOM) {
-			pagefault_out_of_memory();
-			fault = 0;
-		} else if (fault & VM_FAULT_SIGBUS)
+		if (fault & VM_FAULT_OOM)
+			return -EFAULT;
+		else if (fault & VM_FAULT_SIGBUS)
 			do_sigbus(&regs, pgm_int_code, uaddr);
 	}
 	return fault ? -EFAULT : 0;
-- 
cgit v1.2.3


From 33ce614029576b8585e271fd7d90746a37114a15 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 26 May 2011 09:48:30 +0200
Subject: [S390] mm: add page fault retry handling

s390 arch backend for d065bd81 "mm: retry page fault when blocking on
disk transfer".

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/mm/fault.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

(limited to 'arch/s390/mm/fault.c')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index b57723aee848..fe103e891e7a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -280,7 +280,8 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	unsigned long address;
-	int fault, write;
+	unsigned int flags;
+	int fault;
 
 	if (notify_page_fault(regs))
 		return 0;
@@ -299,6 +300,10 @@ static inline int do_exception(struct pt_regs *regs, int access,
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	flags = FAULT_FLAG_ALLOW_RETRY;
+	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+		flags |= FAULT_FLAG_WRITE;
+retry:
 	down_read(&mm->mmap_sem);
 
 	fault = VM_FAULT_BADMAP;
@@ -328,21 +333,31 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	write = (access == VM_WRITE ||
-		 (trans_exc_code & store_indication) == 0x400) ?
-		FAULT_FLAG_WRITE : 0;
-	fault = handle_mm_fault(mm, vma, address, write);
+	fault = handle_mm_fault(mm, vma, address, flags);
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
-	if (fault & VM_FAULT_MAJOR) {
-		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-				     regs, address);
-	} else {
-		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-				     regs, address);
+	/*
+	 * Major/minor page fault accounting is only done on the
+	 * initial attempt. If we go through a retry, it is extremely
+	 * likely that the page will be found in page cache at that point.
+	 */
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			tsk->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				      regs, address);
+		} else {
+			tsk->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation. */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			goto retry;
+		}
 	}
 	/*
 	 * The instruction that caused the program check will
-- 
cgit v1.2.3