From d9104d1ca9662498339c0de975b4666c30485f4e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 11 Sep 2013 14:22:24 -0700 Subject: mm: track vma changes with VM_SOFTDIRTY bit Pavel reported that in case if vma area get unmapped and then mapped (or expanded) in-place, the soft dirty tracker won't be able to recognize this situation since it works on pte level and ptes are get zapped on unmap, loosing soft dirty bit of course. So to resolve this situation we need to track actions on vma level, there VM_SOFTDIRTY flag comes in. When new vma area created (or old expanded) we set this bit, and keep it here until application calls for clearing soft dirty bit. Thus when user space application track memory changes now it can detect if vma area is renewed. Reported-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..2d1e52a58fe9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); -- cgit v1.2.3 From 5d1baf3b63bfc8c709dc44df85ff1475c7ef489d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:38 -0700 Subject: exec: introduce exec_binprm() for "depth == 0" code task_pid_nr_ns() and trace/ptrace code in the middle of the recursive search_binary_handler() looks confusing and imho annoying. We only need this code if "depth == 0", lets add a simple helper which calls search_binary_handler() and does trace_sched_process_exec() + ptrace_event(). The patch also moves the setting of task->did_exec, we need to do this only once. Note: we can kill either task->did_exec or PF_FORKNOEXEC. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 2d1e52a58fe9..4d95b4709ea0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1373,7 +1373,6 @@ int search_binary_handler(struct linux_binprm *bprm) unsigned int depth = bprm->recursion_depth; int try,retval; struct linux_binfmt *fmt; - pid_t old_pid, old_vpid; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (depth > 5) @@ -1387,12 +1386,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - /* Need to fetch pid before load_binary changes it */ - old_pid = current->pid; - rcu_read_lock(); - old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); - rcu_read_unlock(); - retval = -ENOENT; for (try=0; try<2; try++) { read_lock(&binfmt_lock); @@ -1407,16 +1400,11 @@ int search_binary_handler(struct linux_binprm *bprm) retval = fn(bprm); bprm->recursion_depth = depth; if (retval >= 0) { - if (depth == 0) { - trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - } put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) fput(bprm->file); bprm->file = NULL; - current->did_exec = 1; proc_exec_connector(current); return retval; } @@ -1450,9 +1438,29 @@ int search_binary_handler(struct linux_binprm *bprm) } return retval; } - EXPORT_SYMBOL(search_binary_handler); +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret; + + /* Need to fetch pid before load_binary changes it */ + old_pid = current->pid; + rcu_read_lock(); + old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); + rcu_read_unlock(); + + ret = search_binary_handler(bprm); + if (ret >= 0) { + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + current->did_exec = 1; + } + + return ret; +} + /* * sys_execve() executes a new program. */ @@ -1541,7 +1549,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm); + retval = exec_binprm(bprm); if (retval < 0) goto out; -- cgit v1.2.3 From 131b2f9f1214f338f0bf7c0d9760019f2b1d0c20 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:39 -0700 Subject: exec: kill "int depth" in search_binary_handler() Nobody except search_binary_handler() should touch ->recursion_depth, "int depth" buys nothing but complicates the code, kill it. Probably we should also kill "fn" and the !NULL check, ->load_binary should be always defined. And it can not go away after read_unlock() or this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 4d95b4709ea0..b6e35ec818a2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1370,12 +1370,11 @@ EXPORT_SYMBOL(remove_arg_zero); */ int search_binary_handler(struct linux_binprm *bprm) { - unsigned int depth = bprm->recursion_depth; - int try,retval; + int try, retval; struct linux_binfmt *fmt; /* This allows 4 levels of binfmt rewrites before failing hard. */ - if (depth > 5) + if (bprm->recursion_depth > 5) return -ELOOP; retval = security_bprm_check(bprm); @@ -1396,9 +1395,9 @@ int search_binary_handler(struct linux_binprm *bprm) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); - bprm->recursion_depth = depth + 1; + bprm->recursion_depth++; retval = fn(bprm); - bprm->recursion_depth = depth; + bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); allow_write_access(bprm->file); -- cgit v1.2.3 From 9beb266f2d7e5362c5bb9f999255aa1af5318aef Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:40 -0700 Subject: exec: proc_exec_connector() should be called only once A separate one-liner with the minor fix. PROC_EVENT_EXEC reports the "exec" event, but this message is sent at least twice if search_binary_handler() is called by ->load_binary() recursively, say, load_script(). Move it to exec_binprm(), this is "depth == 0" code too. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index b6e35ec818a2..d51f7172832b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1404,7 +1404,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (bprm->file) fput(bprm->file); bprm->file = NULL; - proc_exec_connector(current); return retval; } read_lock(&binfmt_lock); @@ -1455,6 +1454,7 @@ static int exec_binprm(struct linux_binprm *bprm) trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; + proc_exec_connector(current); } return ret; -- cgit v1.2.3 From 52f14282bb0c3d3e5ba2a9eaacb12ff37a033e7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:41 -0700 Subject: exec: move allow_write_access/fput to exec_binprm() When search_binary_handler() succeeds it does allow_write_access() and fput(), then it clears bprm->file to ensure the caller will not do the same. We can simply move this code to exec_binprm() which is called only once. In fact we could move this to free_bprm() and remove the same code in do_execve_common's error path. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index d51f7172832b..a4cfd1d725e0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1400,10 +1400,6 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; return retval; } read_lock(&binfmt_lock); @@ -1455,6 +1451,12 @@ static int exec_binprm(struct linux_binprm *bprm) ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; proc_exec_connector(current); + + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; /* to catch use-after-free */ + } } return ret; -- cgit v1.2.3 From 92eaa565add62d56b90987f58ea9feafc5a7c183 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:42 -0700 Subject: exec: kill ->load_binary != NULL check in search_binary_handler() search_binary_handler() checks ->load_binary != NULL for no reason, this method should be always defined. Turn this check into WARN_ON() and move it into __register_binfmt(). Also, kill the function pointer. The current code looks confusing, as if ->load_binary can go away after read_unlock(&binfmt_lock). But we rely on module_get(fmt->module), this fmt can't be changed or unregistered, otherwise this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index a4cfd1d725e0..7b92fbfa63aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); @@ -1389,14 +1391,11 @@ int search_binary_handler(struct linux_binprm *bprm) for (try=0; try<2; try++) { read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *) = fmt->load_binary; - if (!fn) - continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); bprm->recursion_depth++; - retval = fn(bprm); + retval = fmt->load_binary(bprm); bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); -- cgit v1.2.3 From cb7b6b1cbc20a970c7124efae1c2478155604b54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:44 -0700 Subject: exec: cleanup the CONFIG_MODULES logic search_binary_handler() uses "for (try=0; try<2; try++)" to avoid "goto" but the code looks too complicated and horrible imho. We still need to check "try == 0" before request_module() and add the additional "break" for !CONFIG_MODULES case. Kill this loop and use a simple "bool need_retry" + "goto retry". The code looks much simpler and we do not even need ifdef's, gcc can optimize out the "if (need_retry)" block if !IS_ENABLED(). Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 68 +++++++++++++++++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 37 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 7b92fbfa63aa..ba357e6aea98 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1367,13 +1367,15 @@ out: } EXPORT_SYMBOL(remove_arg_zero); +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ int search_binary_handler(struct linux_binprm *bprm) { - int try, retval; + bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; + int retval; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (bprm->recursion_depth > 5) @@ -1388,47 +1390,39 @@ int search_binary_handler(struct linux_binprm *bprm) return retval; retval = -ENOENT; - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - bprm->recursion_depth++; - retval = fmt->load_binary(bprm); - bprm->recursion_depth--; - if (retval >= 0) { - put_binfmt(fmt); - return retval; - } - read_lock(&binfmt_lock); + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + bprm->recursion_depth++; + retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + if (retval >= 0) { put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } + return retval; } - read_unlock(&binfmt_lock); -#ifdef CONFIG_MODULES - if (retval != -ENOEXEC || bprm->mm == NULL) { + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC || bprm->mm == NULL) break; - } else { -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - if (try) - break; /* -ENOEXEC */ - request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); + if (!bprm->file) { + read_unlock(&binfmt_lock); + return retval; } -#else - break; -#endif } + read_unlock(&binfmt_lock); + + if (need_retry && retval == -ENOEXEC && bprm->mm) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)); + need_retry = false; + goto retry; + } + return retval; } EXPORT_SYMBOL(search_binary_handler); -- cgit v1.2.3 From 4e0621a07ea58a0dc15859be3b743bdeb194a51b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:45 -0700 Subject: exec: don't retry if request_module() fails A separate one-liner for better documentation. It doesn't make sense to retry if request_module() fails to exec /sbin/modprobe, add the additional "request_module() < 0" check. However, this logic still doesn't look exactly right: 1. It would be better to check "request_module() != 0", the user space modprobe process should report the correct exit code. But I didn't dare to add the user-visible change. 2. The whole ENOEXEC logic looks suboptimal. Suppose that we try to exec a "#!path-to-unsupported-binary" script. In this case request_module() + "retry" will be done twice: first by the "depth == 1" code, and then again by the "depth == 0" caller which doesn't make sense. 3. And note that in the case above bprm->buf was already changed by load_script()->prepare_binprm(), so this looks even more ugly. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index ba357e6aea98..635b586de336 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1418,7 +1418,8 @@ int search_binary_handler(struct linux_binprm *bprm) if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; - request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)); + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; need_retry = false; goto retry; } -- cgit v1.2.3 From 6b3c538f5b2cfc53cb6803ec5001bbcf8f18a98e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:46 -0700 Subject: exec: cleanup the error handling in search_binary_handler() The error hanling and ret-from-loop look confusing and inconsistent. - "retval >= 0" simply returns - "!bprm->file" returns too but with read_unlock() because binfmt_lock was already re-acquired - "retval != -ENOEXEC || bprm->mm == NULL" does "break" and relies on the same check after the main loop Consolidate these checks into a single if/return statement. need_retry still checks "retval == -ENOEXEC", but this and -ENOENT before the main loop are not needed. This is only for pathological and impossible list_empty(&formats) case. It is not clear why do we check "bprm->mm == NULL", probably this should be removed. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 635b586de336..8875dd10ae7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1399,22 +1399,17 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth++; retval = fmt->load_binary(bprm); bprm->recursion_depth--; - if (retval >= 0) { + if (retval >= 0 || retval != -ENOEXEC || + bprm->mm == NULL || bprm->file == NULL) { put_binfmt(fmt); return retval; } read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } } read_unlock(&binfmt_lock); - if (need_retry && retval == -ENOEXEC && bprm->mm) { + if (need_retry && retval == -ENOEXEC) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; -- cgit v1.2.3