From c7b96acf1456ef127fef461fcfedb54b81fecfbb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Mar 2013 12:49:49 -0700 Subject: userns: Kill nsown_capable it makes the wrong thing easy nsown_capable is a special case of ns_capable essentially for just CAP_SETUID and CAP_SETGID. For the existing users it doesn't noticably simplify things and from the suggested patches I have seen it encourages people to do the wrong thing. So remove nsown_capable. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- ipc/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ipc') diff --git a/ipc/namespace.c b/ipc/namespace.c index 7ee61bf44933..4be6581d3b7f 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -171,7 +171,7 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new) { struct ipc_namespace *ns = new; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; /* Ditch state from the old ipc namespace */ -- cgit v1.2.3 From bebcb928c820d0ee83aca4b192adc195e43e66a2 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 3 Sep 2013 16:00:08 +0200 Subject: ipc/msg.c: Fix lost wakeup in msgsnd(). The check if the queue is full and adding current to the wait queue of pending msgsnd() operations (ss_add()) must be atomic. Otherwise: - the thread that performs msgsnd() finds a full queue and decides to sleep. - the thread that performs msgrcv() first reads all messages from the queue and then sleeps, because the queue is empty. - the msgrcv() calls do not perform any wakeups, because the msgsnd() task has not yet called ss_add(). - then the msgsnd()-thread first calls ss_add() and then sleeps. Net result: msgsnd() and msgrcv() both sleep forever. Observed with msgctl08 from ltp with a preemptible kernel. Fix: Call ipc_lock_object() before performing the check. The patch also moves security_msg_queue_msgsnd() under ipc_lock_object: - msgctl(IPC_SET) explicitely mentions that it tries to expunge any pending operations that are not allowed anymore with the new permissions. If security_msg_queue_msgsnd() is called without locks, then there might be races. - it makes the patch much simpler. Reported-and-tested-by: Vineet Gupta Acked-by: Rik van Riel Cc: stable@vger.kernel.org # for 3.11 Signed-off-by: Manfred Spraul Signed-off-by: Linus Torvalds --- ipc/msg.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'ipc') diff --git a/ipc/msg.c b/ipc/msg.c index 9f29d9e89bac..b65fdf1a09dd 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -680,16 +680,18 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, goto out_unlock1; } + ipc_lock_object(&msq->q_perm); + for (;;) { struct msg_sender s; err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) - goto out_unlock1; + goto out_unlock0; err = security_msg_queue_msgsnd(msq, msg, msgflg); if (err) - goto out_unlock1; + goto out_unlock0; if (msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes) { @@ -699,10 +701,9 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; - goto out_unlock1; + goto out_unlock0; } - ipc_lock_object(&msq->q_perm); ss_add(msq, &s); if (!ipc_rcu_getref(msq)) { @@ -730,10 +731,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, goto out_unlock0; } - ipc_unlock_object(&msq->q_perm); } - - ipc_lock_object(&msq->q_perm); msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); -- cgit v1.2.3 From 8b8d52ac382b17a19906b930cd69e2edb0aca8ba Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:15 -0700 Subject: ipc,shm: introduce lockless functions to obtain the ipc object This is the third and final patchset that deals with reducing the amount of contention we impose on the ipc lock (kern_ipc_perm.lock). These changes mostly deal with shared memory, previous work has already been done for semaphores and message queues: http://lkml.org/lkml/2013/3/20/546 (sems) http://lkml.org/lkml/2013/5/15/584 (mqueues) With these patches applied, a custom shm microbenchmark stressing shmctl doing IPC_STAT with 4 threads a million times, reduces the execution time by 50%. A similar run, this time with IPC_SET, reduces the execution time from 3 mins and 35 secs to 27 seconds. Patches 1-8: replaces blindly taking the ipc lock for a smarter combination of rcu and ipc_obtain_object, only acquiring the spinlock when updating. Patch 9: renames the ids rw_mutex to rwsem, which is what it already was. Patch 10: is a trivial mqueue leftover cleanup Patch 11: adds a brief lock scheme description, requested by Andrew. This patch: Add shm_obtain_object() and shm_obtain_object_check(), which will allow us to get the ipc object without acquiring the lock. Just as with other forms of ipc, these functions are basically wrappers around ipc_obtain_object*(). Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index c6b4ad5ce3b7..216ae727a936 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -124,6 +124,26 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } +static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + /* * shm_lock_(check_) routines are called in the paths where the rw_mutex * is not necessarily held. -- cgit v1.2.3 From 79ccf0f8c8e04e8b9eda6645ba0f63b0915a3075 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:16 -0700 Subject: ipc,shm: shorten critical region in shmctl_down Instead of holding the ipc lock for the entire function, use the ipcctl_pre_down_nolock and only acquire the lock for specific commands: RMID and SET. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index 216ae727a936..22cffd78dbb1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -780,11 +780,10 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, down_write(&shm_ids(ns).rw_mutex); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, - &shmid64.shm_perm, 0); + ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -792,14 +791,16 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: + ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) goto out_unlock0; @@ -807,6 +808,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: -- cgit v1.2.3 From 3b1c4ad37741e53804ffe0a30dd01e08b2ab6241 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:17 -0700 Subject: ipc: drop ipcctl_pre_down Now that sem, msgque and shm, through *_down(), all use the lockless variant of ipcctl_pre_down(), go ahead and delete it. [akpm@linux-foundation.org: fix function name in kerneldoc, cleanups] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 24 ++++-------------------- ipc/util.h | 3 --- 2 files changed, 4 insertions(+), 23 deletions(-) (limited to 'ipc') diff --git a/ipc/util.c b/ipc/util.c index 4704223bfad4..2c8a93b380ba 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -733,7 +733,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) } /** - * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd + * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd * @ns: the ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve @@ -746,29 +746,13 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with the ipc lock held in case of success - * or an err-code without any lock held otherwise. + * - returns a pointer to the ipc object or otherwise, the corresponding error. * * Call holding the both the rw_mutex and the rcu read lock. */ -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) -{ - struct kern_ipc_perm *ipcp; - - ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm); - if (IS_ERR(ipcp)) - goto out; - - spin_lock(&ipcp->lock); -out: - return ipcp; -} - struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; diff --git a/ipc/util.h b/ipc/util.h index b6a6a88f3002..41a6c4d26399 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -131,9 +131,6 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /* On IA-64, we always use the "64-bit version" of the IPC structures. */ -- cgit v1.2.3 From 68eccc1dc345539d589ae78ee43b835c1a06a134 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:18 -0700 Subject: ipc,shm: introduce shmctl_nolock Similar to semctl and msgctl, when calling msgctl, the *_INFO and *_STAT commands can be performed without acquiring the ipc object. Add a shmctl_nolock() function and move the logic of *_INFO and *_STAT out of msgctl(). Since we are just moving functionality, this change still takes the lock and it will be properly lockless in the next patch. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 57 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 18 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index 22cffd78dbb1..3e123987f054 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -820,29 +820,24 @@ out_up: return err; } -SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +static int shmctl_nolock(struct ipc_namespace *ns, int shmid, + int cmd, int version, void __user *buf) { + int err; struct shmid_kernel *shp; - int err, version; - struct ipc_namespace *ns; - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; + /* preliminary security checks for *_INFO */ + if (cmd == IPC_INFO || cmd == SHM_INFO) { + err = security_shm_shmctl(NULL, cmd); + if (err) + return err; } - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; - - switch (cmd) { /* replace with proc interface ? */ + switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shminfo, 0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; shminfo.shmmax = ns->shm_ctlmax; @@ -864,10 +859,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { struct shm_info shm_info; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shm_info, 0, sizeof(shm_info)); down_read(&shm_ids(ns).rw_mutex); shm_info.used_ids = shm_ids(ns).in_use; @@ -928,6 +919,36 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) err = result; goto out; } + default: + return -EINVAL; + } + +out_unlock: + shm_unlock(shp); +out: + return err; +} + +SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + struct shmid_kernel *shp; + int err, version; + struct ipc_namespace *ns; + + if (cmd < 0 || shmid < 0) { + err = -EINVAL; + goto out; + } + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case SHM_INFO: + case SHM_STAT: + case IPC_STAT: + return shmctl_nolock(ns, shmid, cmd, version, buf); case SHM_LOCK: case SHM_UNLOCK: { -- cgit v1.2.3 From c97cb9ccab8c85428ec21eff690642ad2ce1fa8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:20 -0700 Subject: ipc,shm: make shmctl_nolock lockless While the INFO cmd doesn't take the ipc lock, the STAT commands do acquire it unnecessarily. We can do the permissions and security checks only holding the rcu lock. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index 3e123987f054..a493639550d9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -882,27 +882,31 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, struct shmid64_ds tbuf; int result; + rcu_read_lock(); if (cmd == SHM_STAT) { - shp = shm_lock(ns, shmid); + shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = shp->shm_perm.id; } else { - shp = shm_lock_check(ns, shmid); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = 0; } + err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; + err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; @@ -912,8 +916,9 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, tbuf.shm_cpid = shp->shm_cprid; tbuf.shm_lpid = shp->shm_lprid; tbuf.shm_nattch = shp->shm_nattch; - shm_unlock(shp); - if(copy_shmid_to_user (buf, &tbuf, version)) + rcu_read_unlock(); + + if (copy_shmid_to_user(buf, &tbuf, version)) err = -EFAULT; else err = result; @@ -924,7 +929,7 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, } out_unlock: - shm_unlock(shp); + rcu_read_unlock(); out: return err; } -- cgit v1.2.3 From 2caacaa82a51b78fc0c800e206473874094287ed Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:21 -0700 Subject: ipc,shm: shorten critical region for shmctl With the *_INFO, *_STAT, IPC_RMID and IPC_SET commands already optimized, deal with the remaining SHM_LOCK and SHM_UNLOCK commands. Take the shm_perm lock after doing the initial auditing and security checks. The rest of the logic remains unchanged. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index a493639550d9..8ec381085dec 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -940,10 +940,8 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) int err, version; struct ipc_namespace *ns; - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; - } + if (cmd < 0 || shmid < 0) + return -EINVAL; version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; @@ -954,36 +952,40 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) case SHM_STAT: case IPC_STAT: return shmctl_nolock(ns, shmid, cmd, version, buf); + case IPC_RMID: + case IPC_SET: + return shmctl_down(ns, shmid, cmd, buf, version); case SHM_LOCK: case SHM_UNLOCK: { struct file *shm_file; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); + err = security_shm_shmctl(shp, cmd); + if (err) + goto out_unlock1; + ipc_lock_object(&shp->shm_perm); if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); err = -EPERM; if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) - goto out_unlock; + goto out_unlock0; } - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock; - shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK) { struct user_struct *user = current_user(); @@ -992,32 +994,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } - goto out_unlock; + goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) - goto out_unlock; + goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; get_file(shm_file); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); + fput(shm_file); - goto out; - } - case IPC_RMID: - case IPC_SET: - err = shmctl_down(ns, shmid, cmd, buf, version); return err; + } default: return -EINVAL; } -out_unlock: - shm_unlock(shp); -out: +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); return err; } -- cgit v1.2.3 From f42569b1388b1408b574a5e93a23a663647d4181 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:22 -0700 Subject: ipc,shm: cleanup do_shmat pasta Clean up some of the messy do_shmat() spaghetti code, getting rid of out_free and out_put_dentry labels. This makes shortening the critical region of this function in the next patch a little easier to do and read. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index 8ec381085dec..115dccebc63e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1108,16 +1108,21 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); - if (!sfd) - goto out_put_dentry; + if (!sfd) { + path_put(&path); + goto out_nattch; + } file = alloc_file(&path, f_mode, is_file_hugepages(shp->shm_file) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); - if (IS_ERR(file)) - goto out_free; + if (IS_ERR(file)) { + kfree(sfd); + path_put(&path); + goto out_nattch; + } file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; @@ -1143,7 +1148,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, addr > current->mm->start_stack - size - PAGE_SIZE * 5) goto invalid; } - + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); *raddr = addr; err = 0; @@ -1167,19 +1172,12 @@ out_nattch: else shm_unlock(shp); up_write(&shm_ids(ns).rw_mutex); - -out: return err; out_unlock: shm_unlock(shp); - goto out; - -out_free: - kfree(sfd); -out_put_dentry: - path_put(&path); - goto out_nattch; +out: + return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) -- cgit v1.2.3 From c2c737a0461e61a34676bd0bd1bc1a70a1b4e396 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:23 -0700 Subject: ipc,shm: shorten critical region for shmat Similar to other system calls, acquire the kern_ipc_perm lock after doing the initial permission and security checks. [sasha.levin@oracle.com: dont leave do_shmat with rcu lock held] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index 115dccebc63e..28d19f4ece4b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -19,6 +19,9 @@ * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov + * + * Better ipc lock (kern_ipc_perm.lock) handling + * Davidlohr Bueso , June 2013. */ #include @@ -1086,10 +1089,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, * additional creator id... */ ns = current->nsproxy->ipc_ns; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } err = -EACCES; @@ -1100,11 +1104,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, if (err) goto out_unlock; + ipc_lock_object(&shp->shm_perm); path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); @@ -1175,7 +1181,7 @@ out_nattch: return err; out_unlock: - shm_unlock(shp); + rcu_read_unlock(); out: return err; } -- cgit v1.2.3 From d9a605e40b1376eb02b067d7690580255a0df68f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:24 -0700 Subject: ipc: rename ids->rw_mutex Since in some situations the lock can be shared for readers, we shouldn't be calling it a mutex, rename it to rwsem. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 20 ++++++++++---------- ipc/namespace.c | 4 ++-- ipc/sem.c | 24 ++++++++++++------------ ipc/shm.c | 56 ++++++++++++++++++++++++++++---------------------------- ipc/util.c | 28 ++++++++++++++-------------- ipc/util.h | 4 ++-- 6 files changed, 68 insertions(+), 68 deletions(-) (limited to 'ipc') diff --git a/ipc/msg.c b/ipc/msg.c index b65fdf1a09dd..8203e71bcfbc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -172,7 +172,7 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * - * Called with msg_ids.rw_mutex held (writer) + * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { @@ -259,8 +259,8 @@ static void expunge_all(struct msg_queue *msq, int res) * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * - * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held - * before freeque() is called. msg_ids.rw_mutex remains locked on exit. + * msg_ids.rwsem (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -282,7 +282,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) } /* - * Called with msg_ids.rw_mutex and ipcp locked. + * Called with msg_ids.rwsem and ipcp locked. */ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) { @@ -386,9 +386,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) } /* - * This function handles some msgctl commands which require the rw_mutex + * This function handles some msgctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct msqid_ds __user *buf, int version) @@ -403,7 +403,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } - down_write(&msg_ids(ns).rw_mutex); + down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, @@ -459,7 +459,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&msg_ids(ns).rw_mutex); + up_write(&msg_ids(ns).rwsem); return err; } @@ -494,7 +494,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - down_read(&msg_ids(ns).rw_mutex); + down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&ns->msg_hdrs); @@ -505,7 +505,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgtql = MSGTQL; } max_id = ipc_get_maxid(&msg_ids(ns)); - up_read(&msg_ids(ns).rw_mutex); + up_read(&msg_ids(ns).rwsem); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; diff --git a/ipc/namespace.c b/ipc/namespace.c index 4be6581d3b7f..d43d9384bb2d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -81,7 +81,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, int next_id; int total, in_use; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); in_use = ids->in_use; @@ -93,7 +93,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, free(ns, perm); total++; } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) diff --git a/ipc/sem.c b/ipc/sem.c index 41088899783d..69b6a21f3844 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -322,7 +322,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) } /* - * sem_lock_(check_) routines are called in the paths where the rw_mutex + * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. @@ -426,7 +426,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * - * Called with sem_ids.rw_mutex held (as a writer) + * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) @@ -492,7 +492,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) { @@ -503,7 +503,7 @@ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) } /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -994,8 +994,8 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } -/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked - * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex +/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked + * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) @@ -1116,7 +1116,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - down_read(&sem_ids(ns).rw_mutex); + down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; @@ -1125,7 +1125,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semaem = SEMAEM; } max_id = ipc_get_maxid(&sem_ids(ns)); - up_read(&sem_ids(ns).rw_mutex); + up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -1431,9 +1431,9 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) } /* - * This function handles some semctl commands which require the rw_mutex + * This function handles some semctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) @@ -1448,7 +1448,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - down_write(&sem_ids(ns).rw_mutex); + down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, @@ -1487,7 +1487,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&sem_ids(ns).rw_mutex); + up_write(&sem_ids(ns).rwsem); return err; } diff --git a/ipc/shm.c b/ipc/shm.c index 28d19f4ece4b..cb2cedaa8808 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -83,8 +83,8 @@ void shm_init_ns(struct ipc_namespace *ns) } /* - * Called with shm_ids.rw_mutex (writer) and the shp structure locked. - * Only shm_ids.rw_mutex remains locked on exit. + * Called with shm_ids.rwsem (writer) and the shp structure locked. + * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -148,7 +148,7 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace } /* - * shm_lock_(check_) routines are called in the paths where the rw_mutex + * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) @@ -205,7 +205,7 @@ static void shm_open(struct vm_area_struct *vma) * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.rw_mutex (writer) locked, + * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) @@ -253,7 +253,7 @@ static void shm_close(struct vm_area_struct *vma) struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); @@ -264,10 +264,10 @@ static void shm_close(struct vm_area_struct *vma) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_current(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -298,7 +298,7 @@ static int shm_try_destroy_current(int id, void *p, void *data) return 0; } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -309,7 +309,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * We want to destroy segments without users and with already * exit'ed originating process. * - * As shp->* are changed under rw_mutex, it's safe to skip shp locking. + * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (shp->shm_creator != NULL) return 0; @@ -323,10 +323,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } @@ -338,10 +338,10 @@ void exit_shm(struct task_struct *task) return; /* Destroy all already created segments, but not mapped yet */ - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -475,7 +475,7 @@ static const struct vm_operations_struct shm_vm_ops = { * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * - * Called with shm_ids.rw_mutex held as a writer. + * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) @@ -583,7 +583,7 @@ no_file: } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { @@ -594,7 +594,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -707,7 +707,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf /* * Calculate and add used RSS and swap pages of a shm. - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) @@ -734,7 +734,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, } /* - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) @@ -763,9 +763,9 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, } /* - * This function handles some shmctl commands which require the rw_mutex + * This function handles some shmctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid_ds __user *buf, int version) @@ -780,7 +780,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, @@ -819,7 +819,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; } @@ -850,9 +850,9 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if(err<0) err = 0; @@ -863,14 +863,14 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, struct shm_info shm_info; memset(&shm_info, 0, sizeof(shm_info)); - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -1169,7 +1169,7 @@ out_fput: fput(file); out_nattch: - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; @@ -1177,7 +1177,7 @@ out_nattch: shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; out_unlock: diff --git a/ipc/util.c b/ipc/util.c index 2c8a93b380ba..9a1d779a20e2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -119,7 +119,7 @@ __initcall(ipc_init); void ipc_init_ids(struct ipc_ids *ids) { - init_rwsem(&ids->rw_mutex); + init_rwsem(&ids->rwsem); ids->in_use = 0; ids->seq = 0; @@ -174,7 +174,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * @ids: Identifier set * @key: The key to find * - * Requires ipc_ids.rw_mutex locked. + * Requires ipc_ids.rwsem locked. * Returns the LOCKED pointer to the ipc structure if found or NULL * if not. * If key is found ipc points to the owning ipc structure @@ -208,7 +208,7 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * ipc_get_maxid - get the last assigned id * @ids: IPC identifier set * - * Called with ipc_ids.rw_mutex held. + * Called with ipc_ids.rwsem held. */ int ipc_get_maxid(struct ipc_ids *ids) @@ -246,7 +246,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with writer ipc_ids.rw_mutex held. + * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { @@ -312,9 +312,9 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, { int err; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); err = ops->getnew(ns, params); - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -331,7 +331,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, * * On success, the IPC id is returned. * - * It is called with ipc_ids.rw_mutex and ipcp->lock held. + * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, @@ -376,7 +376,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ @@ -402,7 +402,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, } ipc_unlock(ipcp); } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -413,7 +413,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * @ids: IPC identifier set * @ipcp: ipc perm structure containing the identifier to remove * - * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ @@ -621,7 +621,7 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id) } /** - * ipc_lock - Lock an ipc structure without rw_mutex held + * ipc_lock - Lock an ipc structure without rwsem held * @ids: IPC identifier set * @id: ipc id to look for * @@ -748,7 +748,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding error. * - * Call holding the both the rw_mutex and the rcu read lock. + * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, @@ -868,7 +868,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) * Take the lock - this will be released by the corresponding * call to stop(). */ - down_read(&ids->rw_mutex); + down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) @@ -895,7 +895,7 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ - up_read(&ids->rw_mutex); + up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff --git a/ipc/util.h b/ipc/util.h index 41a6c4d26399..0a362ffca972 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -94,10 +94,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) #define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -/* must be called with ids->rw_mutex acquired for writing */ +/* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rw_mutex acquired for reading */ +/* must be called with ids->rwsem acquired for reading */ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ -- cgit v1.2.3 From 4718787d1f626f45ddb239912bc07266b9880044 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:25 -0700 Subject: ipc,msg: drop msg_unlock There is only one user left, drop this function and just call ipc_unlock_object() and rcu_read_unlock(). Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'ipc') diff --git a/ipc/msg.c b/ipc/msg.c index 8203e71bcfbc..b0d541d42677 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -70,8 +70,6 @@ struct msg_sender { #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) - static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS @@ -270,7 +268,8 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); msg_rmid(ns, msq); - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { atomic_dec(&ns->msg_hdrs); -- cgit v1.2.3 From 05603c44a7627793219b0bd9a7b236099dc9cd9d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:26 -0700 Subject: ipc: document general ipc locking scheme As suggested by Andrew, add a generic initial locking scheme used throughout all sysv ipc mechanisms. Documenting the ids rwsem, how rcu can be enough to do the initial checks and when to actually acquire the kern_ipc_perm.lock spinlock. I found that adding it to util.c was generic enough. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'ipc') diff --git a/ipc/util.c b/ipc/util.c index 9a1d779a20e2..1ddadcf9a2ab 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -15,6 +15,14 @@ * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov + * + * General sysv ipc locking scheme: + * when doing ipc id lookups, take the ids->rwsem + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) + * perform security, capabilities, auditing and permission checks, etc. + * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() + * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) */ #include -- cgit v1.2.3 From 530fcd16d87cd2417c472a581ba5a1e501556c86 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:28 -0700 Subject: ipc, shm: guard against non-existant vma in shmdt(2) When !CONFIG_MMU there's a chance we can derefence a NULL pointer when the VM area isn't found - check the return value of find_vma(). Also, remove the redundant -EINVAL return: retval is set to the proper return code and *only* changed to 0, when we actually unmap the segments. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index cb2cedaa8808..a0ed957cefc9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1288,8 +1288,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) #else /* CONFIG_MMU */ /* under NOMMU conditions, the exact address to be destroyed must be * given */ - retval = -EINVAL; - if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); retval = 0; } -- cgit v1.2.3 From 32a2750010981216fb788c5190fb0e646abfab30 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:29 -0700 Subject: ipc: drop ipc_lock_by_ptr After previous cleanups and optimizations, this function is no longer heavily used and we don't have a good reason to keep it. Update the few remaining callers and get rid of it. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/namespace.c | 3 ++- ipc/util.c | 6 ++++-- ipc/util.h | 6 ------ 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'ipc') diff --git a/ipc/namespace.c b/ipc/namespace.c index d43d9384bb2d..59451c1e214d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -89,7 +89,8 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; - ipc_lock_by_ptr(perm); + rcu_read_lock(); + ipc_lock_object(perm); free(ns, perm); total++; } diff --git a/ipc/util.c b/ipc/util.c index 1ddadcf9a2ab..9f6aa30d2e0f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -205,7 +205,8 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) continue; } - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } @@ -838,7 +839,8 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } } diff --git a/ipc/util.h b/ipc/util.h index 0a362ffca972..14b0a2adba08 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -171,12 +171,6 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) assert_spin_locked(&perm->lock); } -static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) -{ - rcu_read_lock(); - ipc_lock_object(perm); -} - static inline void ipc_unlock(struct kern_ipc_perm *perm) { ipc_unlock_object(perm); -- cgit v1.2.3 From 7a25dd9e042b2b94202a67e5551112f4ac87285a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:30 -0700 Subject: ipc, shm: drop shm_lock_check This function was replaced by a the lockless shm_obtain_object_check(), and no longer has any users. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index a0ed957cefc9..2821cdf93adb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -167,17 +167,6 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) ipc_lock_object(&ipcp->shm_perm); } -static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { ipc_rmid(&shm_ids(ns), &s->shm_perm); -- cgit v1.2.3 From 20b8875abcf2daa1dda5cf70bd6369df5e85d4c1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:31 -0700 Subject: ipc: drop ipc_lock_check No remaining users, we now use ipc_obtain_object_check(). Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 16 ---------------- ipc/util.h | 1 - 2 files changed, 17 deletions(-) (limited to 'ipc') diff --git a/ipc/util.c b/ipc/util.c index 9f6aa30d2e0f..e829da9ed01f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -686,22 +686,6 @@ out: return out; } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - out = ipc_lock(ids, id); - if (IS_ERR(out)) - return out; - - if (ipc_checkid(out, id)) { - ipc_unlock(out); - return ERR_PTR(-EIDRM); - } - - return out; -} - /** * ipcget - Common sys_*get() code * @ns : namsepace diff --git a/ipc/util.h b/ipc/util.h index 14b0a2adba08..c5f3338ba1fa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -177,7 +177,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) rcu_read_unlock(); } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); -- cgit v1.2.3 From 53dad6d3a8e5ac1af8bacc6ac2134ae1a8b085f1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 23 Sep 2013 17:04:45 -0700 Subject: ipc: fix race with LSMs Currently, IPC mechanisms do security and auditing related checks under RCU. However, since security modules can free the security structure, for example, through selinux_[sem,msg_queue,shm]_free_security(), we can race if the structure is freed before other tasks are done with it, creating a use-after-free condition. Manfred illustrates this nicely, for instance with shared mem and selinux: -> do_shmat calls rcu_read_lock() -> do_shmat calls shm_object_check(). Checks that the object is still valid - but doesn't acquire any locks. Then it returns. -> do_shmat calls security_shm_shmat (e.g. selinux_shm_shmat) -> selinux_shm_shmat calls ipc_has_perm() -> ipc_has_perm accesses ipc_perms->security shm_close() -> shm_close acquires rw_mutex & shm_lock -> shm_close calls shm_destroy -> shm_destroy calls security_shm_free (e.g. selinux_shm_free_security) -> selinux_shm_free_security calls ipc_free_security(&shp->shm_perm) -> ipc_free_security calls kfree(ipc_perms->security) This patch delays the freeing of the security structures after all RCU readers are done. Furthermore it aligns the security life cycle with that of the rest of IPC - freeing them based on the reference counter. For situations where we need not free security, the current behavior is kept. Linus states: "... the old behavior was suspect for another reason too: having the security blob go away from under a user sounds like it could cause various other problems anyway, so I think the old code was at least _prone_ to bugs even if it didn't have catastrophic behavior." I have tested this patch with IPC testcases from LTP on both my quad-core laptop and on a 64 core NUMA server. In both cases selinux is enabled, and tests pass for both voluntary and forced preemption models. While the mentioned races are theoretical (at least no one as reported them), I wanted to make sure that this new logic doesn't break anything we weren't aware of. Suggested-by: Linus Torvalds Signed-off-by: Davidlohr Bueso Acked-by: Manfred Spraul Signed-off-by: Linus Torvalds --- ipc/msg.c | 19 +++++++++++++------ ipc/sem.c | 34 ++++++++++++++++++---------------- ipc/shm.c | 17 ++++++++++++----- ipc/util.c | 32 ++++++++++++-------------------- ipc/util.h | 10 +++++++++- 5 files changed, 64 insertions(+), 48 deletions(-) (limited to 'ipc') diff --git a/ipc/msg.c b/ipc/msg.c index b0d541d42677..9e4310c546ae 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -165,6 +165,15 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) ipc_rmid(&msg_ids(ns), &s->q_perm); } +static void msg_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct msg_queue *msq = ipc_rcu_to_struct(p); + + security_msg_queue_free(msq); + ipc_rcu_free(head); +} + /** * newque - Create a new msg queue * @ns: namespace @@ -189,15 +198,14 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) msq->q_perm.security = NULL; retval = security_msg_queue_alloc(msq); if (retval) { - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, ipc_rcu_free); return retval; } /* ipc_addid() locks msq upon success. */ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (id < 0) { - security_msg_queue_free(msq); - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, msg_rcu_free); return id; } @@ -276,8 +284,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) free_msg(msg); } atomic_sub(msq->q_cbytes, &ns->msg_bytes); - security_msg_queue_free(msq); - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, msg_rcu_free); } /* @@ -717,7 +724,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, rcu_read_lock(); ipc_lock_object(&msq->q_perm); - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, ipc_rcu_free); if (msq->q_perm.deleted) { err = -EIDRM; goto out_unlock0; diff --git a/ipc/sem.c b/ipc/sem.c index 69b6a21f3844..19c8b980d1fe 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -243,6 +243,15 @@ static void merge_queues(struct sem_array *sma) } } +static void sem_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct sem_array *sma = ipc_rcu_to_struct(p); + + security_sem_free(sma); + ipc_rcu_free(head); +} + /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. @@ -374,12 +383,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); - ipc_rcu_putref(sma); -} - -static inline void sem_putref(struct sem_array *sma) -{ - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -458,14 +462,13 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_perm.security = NULL; retval = security_sem_alloc(sma); if (retval) { - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return retval; } id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (id < 0) { - security_sem_free(sma); - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, sem_rcu_free); return id; } ns->used_sems += nsems; @@ -1047,8 +1050,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) wake_up_sem_queue_do(&tasks); ns->used_sems -= sma->sem_nsems; - security_sem_free(sma); - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, sem_rcu_free); } static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) @@ -1292,7 +1294,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_unlock(); sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return -ENOMEM; } @@ -1328,20 +1330,20 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if(nsems > SEMMSL_FAST) { sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return -ENOMEM; } } if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); err = -ERANGE; goto out_free; } @@ -1629,7 +1631,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return ERR_PTR(-ENOMEM); } diff --git a/ipc/shm.c b/ipc/shm.c index 2821cdf93adb..d69739610fd4 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -167,6 +167,15 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) ipc_lock_object(&ipcp->shm_perm); } +static void shm_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct shmid_kernel *shp = ipc_rcu_to_struct(p); + + security_shm_free(shp); + ipc_rcu_free(head); +} + static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { ipc_rmid(&shm_ids(ns), &s->shm_perm); @@ -208,8 +217,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) user_shm_unlock(file_inode(shp->shm_file)->i_size, shp->mlock_user); fput (shp->shm_file); - security_shm_free(shp); - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, shm_rcu_free); } /* @@ -497,7 +505,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.security = NULL; error = security_shm_alloc(shp); if (error) { - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, ipc_rcu_free); return error; } @@ -566,8 +574,7 @@ no_id: user_shm_unlock(size, shp->mlock_user); fput(file); no_file: - security_shm_free(shp); - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, shm_rcu_free); return error; } diff --git a/ipc/util.c b/ipc/util.c index e829da9ed01f..fdb8ae740775 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -474,11 +474,6 @@ void ipc_free(void* ptr, int size) kfree(ptr); } -struct ipc_rcu { - struct rcu_head rcu; - atomic_t refcount; -} ____cacheline_aligned_in_smp; - /** * ipc_rcu_alloc - allocate ipc and rcu space * @size: size desired @@ -505,27 +500,24 @@ int ipc_rcu_getref(void *ptr) return atomic_inc_not_zero(&p->refcount); } -/** - * ipc_schedule_free - free ipc + rcu space - * @head: RCU callback structure for queued work - */ -static void ipc_schedule_free(struct rcu_head *head) -{ - vfree(container_of(head, struct ipc_rcu, rcu)); -} - -void ipc_rcu_putref(void *ptr) +void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)) { struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; if (!atomic_dec_and_test(&p->refcount)) return; - if (is_vmalloc_addr(ptr)) { - call_rcu(&p->rcu, ipc_schedule_free); - } else { - kfree_rcu(p, rcu); - } + call_rcu(&p->rcu, func); +} + +void ipc_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); } /** diff --git a/ipc/util.h b/ipc/util.h index c5f3338ba1fa..f2f5036f2eed 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -47,6 +47,13 @@ static inline void msg_exit_ns(struct ipc_namespace *ns) { } static inline void shm_exit_ns(struct ipc_namespace *ns) { } #endif +struct ipc_rcu { + struct rcu_head rcu; + atomic_t refcount; +} ____cacheline_aligned_in_smp; + +#define ipc_rcu_to_struct(p) ((void *)(p+1)) + /* * Structure that holds the parameters needed by the ipc operations * (see after) @@ -120,7 +127,8 @@ void ipc_free(void* ptr, int size); */ void* ipc_rcu_alloc(int size); int ipc_rcu_getref(void *ptr); -void ipc_rcu_putref(void *ptr); +void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)); +void ipc_rcu_free(struct rcu_head *head); struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id); -- cgit v1.2.3 From 5e9d527591421ccdb16acb8c23662231135d8686 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 30 Sep 2013 13:45:04 -0700 Subject: ipc/sem.c: fix race in sem_lock() The exclusion of complex operations in sem_lock() is insufficient: after acquiring the per-semaphore lock, a simple op must first check that sem_perm.lock is not locked and only after that test check complex_count. The current code does it the other way around - and that creates a race. Details are below. The patch is a complete rewrite of sem_lock(), based in part on the code from Mike Galbraith. It removes all gotos and all loops and thus the risk of livelocks. I have tested the patch (together with the next one) on my i3 laptop and it didn't cause any problems. The bug is probably also present in 3.10 and 3.11, but for these kernels it might be simpler just to move the test of sma->complex_count after the spin_is_locked() test. Details of the bug: Assume: - sma->complex_count = 0. - Thread 1: semtimedop(complex op that must sleep) - Thread 2: semtimedop(simple op). Pseudo-Trace: Thread 1: sem_lock(): acquire sem_perm.lock Thread 1: sem_lock(): check for ongoing simple ops Nothing ongoing, thread 2 is still before sem_lock(). Thread 1: try_atomic_semop() <<< preempted. Thread 2: sem_lock(): static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, int nsops) { int locknum; again: if (nsops == 1 && !sma->complex_count) { struct sem *sem = sma->sem_base + sops->sem_num; /* Lock just the semaphore we are interested in. */ spin_lock(&sem->lock); /* * If sma->complex_count was set while we were spinning, * we may need to look at things we did not lock here. */ if (unlikely(sma->complex_count)) { spin_unlock(&sem->lock); goto lock_array; } <<<<<<<<< <<< complex_count is still 0. <<< <<< Here it is preempted <<<<<<<<< Thread 1: try_atomic_semop() returns, notices that it must sleep. Thread 1: increases sma->complex_count. Thread 1: drops sem_perm.lock Thread 2: /* * Another process is holding the global lock on the * sem_array; we cannot enter our critical section, * but have to wait for the global lock to be released. */ if (unlikely(spin_is_locked(&sma->sem_perm.lock))) { spin_unlock(&sem->lock); spin_unlock_wait(&sma->sem_perm.lock); goto again; } <<< sem_perm.lock already dropped, thus no "goto again;" locknum = sops->sem_num; Signed-off-by: Manfred Spraul Cc: Mike Galbraith Cc: Rik van Riel Cc: Davidlohr Bueso Cc: [3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 122 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 44 deletions(-) (limited to 'ipc') diff --git a/ipc/sem.c b/ipc/sem.c index 19c8b980d1fe..4a92c0447ad6 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -252,71 +252,105 @@ static void sem_rcu_free(struct rcu_head *head) ipc_rcu_free(head); } +/* + * Wait until all currently ongoing simple ops have completed. + * Caller must own sem_perm.lock. + * New simple ops cannot start, because simple ops first check + * that sem_perm.lock is free. + */ +static void sem_wait_array(struct sem_array *sma) +{ + int i; + struct sem *sem; + + for (i = 0; i < sma->sem_nsems; i++) { + sem = sma->sem_base + i; + spin_unlock_wait(&sem->lock); + } +} + /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. * Otherwise, lock the entire semaphore array, since we either have * multiple semaphores in our own semops, or we need to look at * semaphores from other pending complex operations. - * - * Carefully guard against sma->complex_count changing between zero - * and non-zero while we are spinning for the lock. The value of - * sma->complex_count cannot change while we are holding the lock, - * so sem_unlock should be fine. - * - * The global lock path checks that all the local locks have been released, - * checking each local lock once. This means that the local lock paths - * cannot start their critical sections while the global lock is held. */ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, int nsops) { - int locknum; - again: - if (nsops == 1 && !sma->complex_count) { - struct sem *sem = sma->sem_base + sops->sem_num; + struct sem *sem; - /* Lock just the semaphore we are interested in. */ - spin_lock(&sem->lock); + if (nsops != 1) { + /* Complex operation - acquire a full lock */ + ipc_lock_object(&sma->sem_perm); - /* - * If sma->complex_count was set while we were spinning, - * we may need to look at things we did not lock here. + /* And wait until all simple ops that are processed + * right now have dropped their locks. */ - if (unlikely(sma->complex_count)) { - spin_unlock(&sem->lock); - goto lock_array; - } + sem_wait_array(sma); + return -1; + } + + /* + * Only one semaphore affected - try to optimize locking. + * The rules are: + * - optimized locking is possible if no complex operation + * is either enqueued or processed right now. + * - The test for enqueued complex ops is simple: + * sma->complex_count != 0 + * - Testing for complex ops that are processed right now is + * a bit more difficult. Complex ops acquire the full lock + * and first wait that the running simple ops have completed. + * (see above) + * Thus: If we own a simple lock and the global lock is free + * and complex_count is now 0, then it will stay 0 and + * thus just locking sem->lock is sufficient. + */ + sem = sma->sem_base + sops->sem_num; + if (sma->complex_count == 0) { /* - * Another process is holding the global lock on the - * sem_array; we cannot enter our critical section, - * but have to wait for the global lock to be released. + * It appears that no complex operation is around. + * Acquire the per-semaphore lock. */ - if (unlikely(spin_is_locked(&sma->sem_perm.lock))) { - spin_unlock(&sem->lock); - spin_unlock_wait(&sma->sem_perm.lock); - goto again; + spin_lock(&sem->lock); + + /* Then check that the global lock is free */ + if (!spin_is_locked(&sma->sem_perm.lock)) { + /* spin_is_locked() is not a memory barrier */ + smp_mb(); + + /* Now repeat the test of complex_count: + * It can't change anymore until we drop sem->lock. + * Thus: if is now 0, then it will stay 0. + */ + if (sma->complex_count == 0) { + /* fast path successful! */ + return sops->sem_num; + } } + spin_unlock(&sem->lock); + } - locknum = sops->sem_num; + /* slow path: acquire the full lock */ + ipc_lock_object(&sma->sem_perm); + + if (sma->complex_count == 0) { + /* False alarm: + * There is no complex operation, thus we can switch + * back to the fast path. + */ + spin_lock(&sem->lock); + ipc_unlock_object(&sma->sem_perm); + return sops->sem_num; } else { - int i; - /* - * Lock the semaphore array, and wait for all of the - * individual semaphore locks to go away. The code - * above ensures no new single-lock holders will enter - * their critical section while the array lock is held. + /* Not a false alarm, thus complete the sequence for a + * full lock. */ - lock_array: - ipc_lock_object(&sma->sem_perm); - for (i = 0; i < sma->sem_nsems; i++) { - struct sem *sem = sma->sem_base + i; - spin_unlock_wait(&sem->lock); - } - locknum = -1; + sem_wait_array(sma); + return -1; } - return locknum; } static inline void sem_unlock(struct sem_array *sma, int locknum) -- cgit v1.2.3 From 6d07b68ce16ae9535955ba2059dedba5309c3ca1 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 30 Sep 2013 13:45:06 -0700 Subject: ipc/sem.c: optimize sem_lock() Operations that need access to the whole array must guarantee that there are no simple operations ongoing. Right now this is achieved by spin_unlock_wait(sem->lock) on all semaphores. If complex_count is nonzero, then this spin_unlock_wait() is not necessary, because it was already performed in the past by the thread that increased complex_count and even though sem_perm.lock was dropped inbetween, no simple operation could have started, because simple operations cannot start when complex_count is non-zero. Signed-off-by: Manfred Spraul Cc: Mike Galbraith Cc: Rik van Riel Reviewed-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'ipc') diff --git a/ipc/sem.c b/ipc/sem.c index 4a92c0447ad6..e20658d76bb5 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -257,12 +257,20 @@ static void sem_rcu_free(struct rcu_head *head) * Caller must own sem_perm.lock. * New simple ops cannot start, because simple ops first check * that sem_perm.lock is free. + * that a) sem_perm.lock is free and b) complex_count is 0. */ static void sem_wait_array(struct sem_array *sma) { int i; struct sem *sem; + if (sma->complex_count) { + /* The thread that increased sma->complex_count waited on + * all sem->lock locks. Thus we don't need to wait again. + */ + return; + } + for (i = 0; i < sma->sem_nsems; i++) { sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); -- cgit v1.2.3 From d8c633766ad88527f25d9f81a5c2f083d78a2b39 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 30 Sep 2013 13:45:07 -0700 Subject: ipc/sem.c: synchronize the proc interface The proc interface is not aware of sem_lock(), it instead calls ipc_lock_object() directly. This means that simple semop() operations can run in parallel with the proc interface. Right now, this is uncritical, because the implementation doesn't do anything that requires a proper synchronization. But it is dangerous and therefore should be fixed. Signed-off-by: Manfred Spraul Cc: Davidlohr Bueso Cc: Mike Galbraith Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'ipc') diff --git a/ipc/sem.c b/ipc/sem.c index e20658d76bb5..cd6a733011a2 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2103,6 +2103,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) struct sem_array *sma = it; time_t sem_otime; + /* + * The proc interface isn't aware of sem_lock(), it calls + * ipc_lock_object() directly (in sysvipc_find_ipc). + * In order to stay compatible with sem_lock(), we must wait until + * all simple semop() calls have left their critical regions. + */ + sem_wait_array(sma); + sem_otime = get_semotime(sma); return seq_printf(s, -- cgit v1.2.3 From 0e8c665699e953fa58dc1b0b0d09e5dce7343cc7 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 30 Sep 2013 13:45:25 -0700 Subject: ipc/sem.c: update sem_otime for all operations In commit 0a2b9d4c7967 ("ipc/sem.c: move wake_up_process out of the spinlock section"), the update of semaphore's sem_otime(last semop time) was moved to one central position (do_smart_update). But since do_smart_update() is only called for operations that modify the array, this means that wait-for-zero semops do not update sem_otime anymore. The fix is simple: Non-alter operations must update sem_otime. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Reported-by: Jia He Tested-by: Jia He Cc: Davidlohr Bueso Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'ipc') diff --git a/ipc/sem.c b/ipc/sem.c index cd6a733011a2..8c4f59b0204a 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -917,6 +917,24 @@ again: return semop_completed; } +/** + * set_semotime(sma, sops) - set sem_otime + * @sma: semaphore array + * @sops: operations that modified the array, may be NULL + * + * sem_otime is replicated to avoid cache line trashing. + * This function sets one instance to the current time. + */ +static void set_semotime(struct sem_array *sma, struct sembuf *sops) +{ + if (sops == NULL) { + sma->sem_base[0].sem_otime = get_seconds(); + } else { + sma->sem_base[sops[0].sem_num].sem_otime = + get_seconds(); + } +} + /** * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue * @sma: semaphore array @@ -967,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop } } } - if (otime) { - if (sops == NULL) { - sma->sem_base[0].sem_otime = get_seconds(); - } else { - sma->sem_base[sops[0].sem_num].sem_otime = - get_seconds(); - } - } + if (otime) + set_semotime(sma, sops); } - /* The following counts are associated to each semaphore: * semncnt number of tasks waiting on semval being nonzero * semzcnt number of tasks waiting on semval being zero @@ -1839,12 +1850,17 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, error = perform_atomic_semop(sma, sops, nsops, un, task_tgid_vnr(current)); - if (error <= 0) { - if (alter && error == 0) + if (error == 0) { + /* If the operation was successful, then do + * the required updates. + */ + if (alter) do_smart_update(sma, sops, nsops, 1, &tasks); - - goto out_unlock_free; + else + set_semotime(sma, sops); } + if (error <= 0) + goto out_unlock_free; /* We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. -- cgit v1.2.3 From 4271b05a227dc6175b66c3d9941aeab09048aeb2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 30 Sep 2013 13:45:26 -0700 Subject: ipc,msg: prevent race with rmid in msgsnd,msgrcv This fixes a race in both msgrcv() and msgsnd() between finding the msg and actually dealing with the queue, as another thread can delete shmid underneath us if we are preempted before acquiring the kern_ipc_perm.lock. Manfred illustrates this nicely: Assume a preemptible kernel that is preempted just after msq = msq_obtain_object_check(ns, msqid) in do_msgrcv(). The only lock that is held is rcu_read_lock(). Now the other thread processes IPC_RMID. When the first task is resumed, then it will happily wait for messages on a deleted queue. Fix this by checking for if the queue has been deleted after taking the lock. Signed-off-by: Davidlohr Bueso Reported-by: Manfred Spraul Cc: Rik van Riel Cc: Mike Galbraith Cc: [3.11] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'ipc') diff --git a/ipc/msg.c b/ipc/msg.c index 9e4310c546ae..558aa91186b6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -695,6 +695,12 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, if (ipcperms(ns, &msq->q_perm, S_IWUGO)) goto out_unlock0; + /* raced with RMID? */ + if (msq->q_perm.deleted) { + err = -EIDRM; + goto out_unlock0; + } + err = security_msg_queue_msgsnd(msq, msg, msgflg); if (err) goto out_unlock0; @@ -901,6 +907,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl goto out_unlock1; ipc_lock_object(&msq->q_perm); + + /* raced with RMID? */ + if (msq->q_perm.deleted) { + msg = ERR_PTR(-EIDRM); + goto out_unlock0; + } + msg = find_msg(msq, &msgtyp, mode); if (!IS_ERR(msg)) { /* -- cgit v1.2.3