From 4918b6d140c4822201ebbe2f070875332aff337b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:26:07 -0700 Subject: ceph: add F_SYNC file flag to force sync (non-O_DIRECT) io This allows us to force IO through the sync path which you normally only get when multiple clients are reading/writing to the same file or by mounting with -o sync. Among other things, this lets test programs verify correctness with a single mount. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4698a5c553dc..44e4fe9fba02 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -643,7 +643,8 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); else @@ -720,7 +721,8 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { -- cgit v1.2.3 From d8de9ab63a57326d21154c13c365f949f53ce8e1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:34 -0700 Subject: ceph: avoid carrying Fw cap during write into page cache The generic_file_aio_write call may block on balance_dirty_pages while we flush data to the OSDs. If we hold a reference to the FILE_WR cap during that interval, revocation by the MDS (e.g., to do a stat(2)) may be very slow. 
Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 44e4fe9fba02..6c90cf090601 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -713,7 +713,7 @@ retry_snap: want = CEPH_CAP_FILE_BUFFER; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) - goto out; + goto out_put; dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -726,8 +726,18 @@ retry_snap: ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * buffered write; drop Fw early to avoid slow + * revocation if we get stuck on balance_dirty_pages + */ + int dirty; + + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + ceph_put_cap_refs(ci, got); + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { @@ -735,7 +745,12 @@ retry_snap: if (err < 0) ret = err; } + + if (dirty) + __mark_inode_dirty(inode, dirty); + goto out; } + if (ret >= 0) { int dirty; spin_lock(&inode->i_lock); @@ -745,12 +760,13 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out: +out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); +out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); -- cgit v1.2.3 From acda76578813ef893004ecad0e5ad2bb6039e5f7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:48 -0700 Subject: ceph: fix bad 
parent_inode calc in ceph_lookup_open We were always getting NULL here because the intent file f_dentry is always NULL at this point, which means we were always passing NULL to ceph_mdsc_do_request. In reality, this was fine, since this isn't currently ever a write operation that needs to get strung on the dir's unsafe list. Use the dir explicitly, and only pass it if this open has side-effects that a dir fsync should flush. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c90cf090601..9b667e9abf4c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -223,7 +223,6 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; int err; int flags = nd->intent.open.flags - 1; /* silly vfs! */ @@ -242,7 +241,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); dentry = ceph_finish_lookup(req, dentry, err); if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); -- cgit v1.2.3 From 9bae113a085b790de384bf86f09e15b42a65a985 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:59 -0700 Subject: ceph: only link open operations to directory unsafe list if O_CREAT|O_TRUNC We only need to put these on the directory unsafe list if they have side effects that fsync(2) should flush out. 
Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9b667e9abf4c..e0115eb4e9ba 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -194,7 +194,8 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? + parent_inode : NULL, req); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); -- cgit v1.2.3 From 468640e32c7f6bfdaaa011095cc388786755d159 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:28:11 -0700 Subject: ceph: fix ceph_lookup_open intent usage We weren't properly calling lookup_instantiate_filp when setting up the lookup intent, which could lead to file leakage on errors. So: - use separate helper for the hidden snapdir translation, immediately following the mds request - use ceph_finish_lookup for the final dentry/return value dance in the exit path - lookup_instantiate_filp on success Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e0115eb4e9ba..f34d47d66e7c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -223,8 +223,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file = nd->intent.open.file; + struct file *file; struct ceph_mds_request *req; + struct dentry *ret; int err; int flags = nd->intent.open.flags - 1; /* silly vfs! 
*/ @@ -245,15 +246,21 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out; + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + if (err) + goto out; + file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); + if (IS_ERR(file)) + err = PTR_ERR(file); +out: + ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("ceph_lookup_open result=%p\n", ret); + return ret; } int ceph_release(struct inode *inode, struct file *file) -- cgit v1.2.3 From 5f21c96dd5c615341963036ae8f5e4f5227a818d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:29 -0700 Subject: ceph: protect access to d_parent d_parent is protected by d_lock: use it when looking up a dentry's parent directory inode. Also take a reference and drop it in the caller to avoid a use-after-free. 
Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f34d47d66e7c..45fbd69daabe 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -194,8 +194,10 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? - parent_inode : NULL, req); + if (flags & (O_CREAT|O_TRUNC)) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); -- cgit v1.2.3