diff options
Diffstat (limited to 'sys-cluster/lustre/files/0002-LU-1337-vfs-kernel-3.1-kills-inode-i_alloc_sem.patch')
-rw-r--r-- | sys-cluster/lustre/files/0002-LU-1337-vfs-kernel-3.1-kills-inode-i_alloc_sem.patch | 446 |
1 files changed, 446 insertions, 0 deletions
diff --git a/sys-cluster/lustre/files/0002-LU-1337-vfs-kernel-3.1-kills-inode-i_alloc_sem.patch b/sys-cluster/lustre/files/0002-LU-1337-vfs-kernel-3.1-kills-inode-i_alloc_sem.patch new file mode 100644 index 000000000..04ccefe11 --- /dev/null +++ b/sys-cluster/lustre/files/0002-LU-1337-vfs-kernel-3.1-kills-inode-i_alloc_sem.patch @@ -0,0 +1,446 @@ +From 13a22da522d306d69511e439406d2f986a596adb Mon Sep 17 00:00:00 2001 +From: Liu Xuezhao <xuezhao.liu@emc.com> +Date: Sun, 22 Jul 2012 01:07:18 +0800 +Subject: [PATCH 02/13] LU-1337 vfs: kernel 3.1 kills inode->i_alloc_sem + +Kernel 3.1 kills inode->i_alloc_sem, use i_dio_count and +inode_dio_wait/inode_dio_done instead. +(kernel commit bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3). + +Add HAVE_INODE_DIO_WAIT to differentiate it. +Add INODE_DIO_LOCK_WRITE/INODE_DIO_RELEASE_WRITE, + INODE_DIO_LOCK_READ/INODE_DIO_RELEASE_READ macros. + +Signed-off-by: Liu Xuezhao <xuezhao.liu@emc.com> +Change-Id: Ife36e07a85c76153985a4a86ee1973262c4c0e27 +--- + lustre/autoconf/lustre-core.m4 | 22 ++++++++++++ + lustre/include/linux/lustre_compat25.h | 18 ++++++---- + lustre/llite/llite_lib.c | 4 +-- + lustre/llite/vvp_io.c | 27 ++++++-------- + lustre/llite/vvp_page.c | 1 - + lustre/obdfilter/filter.c | 65 +++++++++++++++++----------------- + lustre/obdfilter/filter_io.c | 28 ++++++++------- + lustre/obdfilter/filter_io_26.c | 10 +++--- + lustre/osc/osc_cache.c | 6 ++-- + lustre/osd-ldiskfs/osd_io.c | 1 - + 10 files changed, 102 insertions(+), 80 deletions(-) + +diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 +index 403add5..0aef14f 100644 +--- a/lustre/autoconf/lustre-core.m4 ++++ b/lustre/autoconf/lustre-core.m4 +@@ -1850,6 +1850,27 @@ LB_LINUX_TRY_COMPILE([ + ]) + + # ++# 3.1 kills inode->i_alloc_sem, use i_dio_count and inode_dio_wait/ ++# inode_dio_done instead. 
++# see kernel commit bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 ++# ++AC_DEFUN([LC_INODE_DIO_WAIT], ++[AC_MSG_CHECKING([if inode->i_alloc_sem is killed and use inode_dio_wait/done.]) ++LB_LINUX_TRY_COMPILE([ ++ #include <linux/fs.h> ++],[ ++ inode_dio_wait((struct inode *)0); ++ inode_dio_done((struct inode *)0); ++],[ ++ AC_DEFINE(HAVE_INODE_DIO_WAIT, 1, ++ [inode->i_alloc_sem is killed and use inode_dio_wait/done]) ++ AC_MSG_RESULT([yes]) ++],[ ++ AC_MSG_RESULT([no]) ++]) ++]) ++ ++# + # 3.3 introduces migrate_mode.h and migratepage has 4 args + # + AC_DEFUN([LC_HAVE_MIGRATE_HEADER], +@@ -2035,6 +2056,7 @@ AC_DEFUN([LC_PROG_LINUX], + + # 3.1 + LC_LM_XXX_LOCK_MANAGER_OPS ++ LC_INODE_DIO_WAIT + + # 3.3 + LC_HAVE_MIGRATE_HEADER +diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h +index 41b4516..bb45125 100644 +--- a/lustre/include/linux/lustre_compat25.h ++++ b/lustre/include/linux/lustre_compat25.h +@@ -269,13 +269,17 @@ static inline int mapping_has_pages(struct address_space *mapping) + (type *)( (char *)__mptr - offsetof(type,member) );}) + #endif + +-#define UP_WRITE_I_ALLOC_SEM(i) up_write(&(i)->i_alloc_sem) +-#define DOWN_WRITE_I_ALLOC_SEM(i) down_write(&(i)->i_alloc_sem) +-#define LASSERT_I_ALLOC_SEM_WRITE_LOCKED(i) LASSERT(down_read_trylock(&(i)->i_alloc_sem) == 0) +- +-#define UP_READ_I_ALLOC_SEM(i) up_read(&(i)->i_alloc_sem) +-#define DOWN_READ_I_ALLOC_SEM(i) down_read(&(i)->i_alloc_sem) +-#define LASSERT_I_ALLOC_SEM_READ_LOCKED(i) LASSERT(down_write_trylock(&(i)->i_alloc_sem) == 0) ++#ifdef HAVE_INODE_DIO_WAIT ++# define INODE_DIO_LOCK_WRITE(i) inode_dio_wait(i) ++# define INODE_DIO_RELEASE_WRITE(i) do {} while (0) ++# define INODE_DIO_LOCK_READ(i) atomic_inc(&(i)->i_dio_count) ++# define INODE_DIO_RELEASE_READ(i) inode_dio_done(i) ++#else ++# define INODE_DIO_LOCK_WRITE(i) down_write(&(i)->i_alloc_sem) ++# define INODE_DIO_RELEASE_WRITE(i) up_write(&(i)->i_alloc_sem) ++# define INODE_DIO_LOCK_READ(i) 
down_read(&(i)->i_alloc_sem) ++# define INODE_DIO_RELEASE_READ(i) up_read(&(i)->i_alloc_sem) ++#endif + + #include <linux/mpage.h> /* for generic_writepages */ + +diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c +index 9980c2c..7f3ac28 100644 +--- a/lustre/llite/llite_lib.c ++++ b/lustre/llite/llite_lib.c +@@ -1439,12 +1439,12 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) + + if (!S_ISDIR(inode->i_mode)) { + if (ia_valid & ATTR_SIZE) +- UP_WRITE_I_ALLOC_SEM(inode); ++ INODE_DIO_RELEASE_WRITE(inode); + mutex_unlock(&inode->i_mutex); + cfs_down_write(&lli->lli_trunc_sem); + mutex_lock(&inode->i_mutex); + if (ia_valid & ATTR_SIZE) +- DOWN_WRITE_I_ALLOC_SEM(inode); ++ INODE_DIO_LOCK_WRITE(inode); + } + + /* We need a steady stripe configuration for setattr to avoid +diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c +index 50a19c9..0b4cfb5 100644 +--- a/lustre/llite/vvp_io.c ++++ b/lustre/llite/vvp_io.c +@@ -295,7 +295,7 @@ static int vvp_io_setattr_iter_init(const struct lu_env *env, + */ + mutex_unlock(&inode->i_mutex); + if (cl_io_is_trunc(ios->cis_io)) +- UP_WRITE_I_ALLOC_SEM(inode); ++ INODE_DIO_RELEASE_WRITE(inode); + cio->u.setattr.cui_locks_released = 1; + return 0; + } +@@ -348,7 +348,7 @@ static int vvp_io_setattr_trunc(const struct lu_env *env, + const struct cl_io_slice *ios, + struct inode *inode, loff_t size) + { +- DOWN_WRITE_I_ALLOC_SEM(inode); ++ INODE_DIO_LOCK_WRITE(inode); + return 0; + } + +@@ -420,7 +420,7 @@ static void vvp_io_setattr_fini(const struct lu_env *env, + if (cio->u.setattr.cui_locks_released) { + mutex_lock(&inode->i_mutex); + if (cl_io_is_trunc(io)) +- DOWN_WRITE_I_ALLOC_SEM(inode); ++ INODE_DIO_LOCK_WRITE(inode); + cio->u.setattr.cui_locks_released = 0; + } + vvp_io_fini(env, ios); +@@ -689,28 +689,26 @@ static int vvp_io_fault_start(const struct lu_env *env, + + /* must return locked page */ + if (fio->ft_mkwrite) { +- /* we grab alloc_sem to exclude truncate case. 
+- * Otherwise, we could add dirty pages into osc cache +- * while truncate is on-going. */ +- DOWN_READ_I_ALLOC_SEM(inode); +- +- LASSERT(cfio->ft_vmpage != NULL); +- lock_page(cfio->ft_vmpage); ++ LASSERT(cfio->ft_vmpage != NULL); ++ lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + return result; + } + +- vmpage = cfio->ft_vmpage; +- LASSERT(PageLocked(vmpage)); ++ vmpage = cfio->ft_vmpage; ++ LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); + ++ ++ size = i_size_read(inode); + /* Though we have already held a cl_lock upon this page, but + * it still can be truncated locally. */ +- if (unlikely(vmpage->mapping == NULL)) { ++ if (unlikely((vmpage->mapping != inode->i_mapping) || ++ (page_offset(vmpage) > size))) { + CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); + + /* return +1 to stop cl_io_loop() and ll_fault() will catch +@@ -758,7 +756,6 @@ static int vvp_io_fault_start(const struct lu_env *env, + } + } + +- size = i_size_read(inode); + last = cl_index(obj, size - 1); + LASSERT(fio->ft_index <= last); + if (fio->ft_index == last) +@@ -777,8 +774,6 @@ out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); +- if (fio->ft_mkwrite) +- UP_READ_I_ALLOC_SEM(inode); + #ifdef HAVE_VM_OP_FAULT + cfio->fault.ft_flags &= ~VM_FAULT_LOCKED; + #endif +diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c +index b428744..66842a0 100644 +--- a/lustre/llite/vvp_page.c ++++ b/lustre/llite/vvp_page.c +@@ -420,7 +420,6 @@ static void vvp_transient_page_verify(const struct cl_page *page) + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(!mutex_trylock(&inode->i_mutex)); +- /* LASSERT_SEM_LOCKED(&inode->i_alloc_sem); */ + } + + static int vvp_transient_page_own(const struct lu_env *env, +diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c +index 23a992e..a93501d 
100644 +--- a/lustre/obdfilter/filter.c ++++ b/lustre/obdfilter/filter.c +@@ -3343,13 +3343,13 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, + } + if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) { + unsigned long now = jiffies; +- /* Filter truncates and writes are serialized by +- * i_alloc_sem, see the comment in +- * filter_preprw_write.*/ +- if (ia_valid & ATTR_SIZE) +- down_write(&inode->i_alloc_sem); ++ /* Filter truncates and writes are serialized. ++ * See the comment in filter_preprw_write.*/ + mutex_lock(&inode->i_mutex); +- fsfilt_check_slow(exp->exp_obd, now, "i_alloc_sem and i_mutex"); ++ if (ia_valid & ATTR_SIZE) ++ INODE_DIO_LOCK_WRITE(inode); ++ fsfilt_check_slow(exp->exp_obd, now, ++ "i_mutex and INODE_DIO_LOCK_WRITE"); + old_size = i_size_read(inode); + } + +@@ -3473,7 +3473,7 @@ out_unlock: + if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) + mutex_unlock(&inode->i_mutex); + if (ia_valid & ATTR_SIZE) +- up_write(&inode->i_alloc_sem); ++ INODE_DIO_RELEASE_WRITE(inode); + if (fcc) + OBD_FREE(fcc, sizeof(*fcc)); + +@@ -3554,14 +3554,14 @@ int filter_setattr(const struct lu_env *env, struct obd_export *exp, + */ + if (oa->o_valid & + (OBD_MD_FLMTIME | OBD_MD_FLATIME | OBD_MD_FLCTIME)) { +- unsigned long now = jiffies; +- down_write(&dentry->d_inode->i_alloc_sem); +- fsfilt_check_slow(exp->exp_obd, now, "i_alloc_sem"); +- fmd = filter_fmd_get(exp, oa->o_id, oa->o_seq); +- if (fmd && fmd->fmd_mactime_xid < oti->oti_xid) +- fmd->fmd_mactime_xid = oti->oti_xid; +- filter_fmd_put(exp, fmd); +- up_write(&dentry->d_inode->i_alloc_sem); ++ unsigned long now = jiffies; ++ INODE_DIO_LOCK_WRITE(dentry->d_inode); ++ fsfilt_check_slow(exp->exp_obd, now, "INODE_DIO_LOCK_WRITE"); ++ fmd = filter_fmd_get(exp, oa->o_id, oa->o_seq); ++ if (fmd && fmd->fmd_mactime_xid < oti->oti_xid) ++ fmd->fmd_mactime_xid = oti->oti_xid; ++ filter_fmd_put(exp, fmd); ++ INODE_DIO_RELEASE_WRITE(dentry->d_inode); + } + + /* setting objects attributes 
(including owner/group) */ +@@ -4292,28 +4292,29 @@ int filter_destroy(const struct lu_env *env, struct obd_export *exp, + *fcc = oa->o_lcookie; + } + +- /* we're gonna truncate it first in order to avoid possible deadlock: +- * P1 P2 +- * open trasaction open transaction +- * down(i_zombie) down(i_zombie) +- * restart transaction +- * (see BUG 4180) -bzzz +- * +- * take i_alloc_sem too to prevent other threads from writing to the +- * file while we are truncating it. This can cause lock ordering issue +- * between page lock, i_mutex & starting new journal handle. +- * (see bug 20321) -johann +- */ ++ /* we're gonna truncate it first in order to avoid possible deadlock: ++ * P1 P2 ++ * open transaction open transaction ++ * down(i_zombie) down(i_zombie) ++ * restart transaction ++ * (see BUG 4180) -bzzz ++ * ++ * take INODE_DIO_LOCK_WRITE too to prevent other threads from writing ++ * to the file while we are truncating it. This can cause lock ordering ++ * issue between page lock, i_mutex & starting new journal handle. 
++ * (see bug 20321) -johann ++ */ + now = jiffies; +- down_write(&dchild->d_inode->i_alloc_sem); ++ INODE_DIO_LOCK_WRITE(dchild->d_inode); + mutex_lock(&dchild->d_inode->i_mutex); +- fsfilt_check_slow(exp->exp_obd, now, "i_alloc_sem and i_mutex"); ++ fsfilt_check_slow(exp->exp_obd, now, ++ "INODE_DIO_LOCK_WRITE and i_mutex"); + + /* VBR: version recovery check */ + rc = filter_version_get_check(exp, oti, dchild->d_inode); + if (rc) { + mutex_unlock(&dchild->d_inode->i_mutex); +- up_write(&dchild->d_inode->i_alloc_sem); ++ INODE_DIO_RELEASE_WRITE(dchild->d_inode); + GOTO(cleanup, rc); + } + +@@ -4321,7 +4322,7 @@ int filter_destroy(const struct lu_env *env, struct obd_export *exp, + NULL, 1); + if (IS_ERR(handle)) { + mutex_unlock(&dchild->d_inode->i_mutex); +- up_write(&dchild->d_inode->i_alloc_sem); ++ INODE_DIO_RELEASE_WRITE(dchild->d_inode); + GOTO(cleanup, rc = PTR_ERR(handle)); + } + +@@ -4333,7 +4334,7 @@ int filter_destroy(const struct lu_env *env, struct obd_export *exp, + rc = fsfilt_setattr(obd, dchild, handle, &iattr, 1); + rc2 = fsfilt_commit(obd, dchild->d_inode, handle, 0); + mutex_unlock(&dchild->d_inode->i_mutex); +- up_write(&dchild->d_inode->i_alloc_sem); ++ INODE_DIO_RELEASE_WRITE(dchild->d_inode); + if (rc) + GOTO(cleanup, rc); + if (rc2) +diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c +index a946d90..3c4dcad 100644 +--- a/lustre/obdfilter/filter_io.c ++++ b/lustre/obdfilter/filter_io.c +@@ -634,7 +634,8 @@ static int filter_grant_check(struct obd_export *exp, struct obdo *oa, + * on mulitple inodes. That isn't all, because there still exists the + * possibility of a truncate starting a new transaction while holding the ext3 + * rwsem = write while some writes (which have started their transactions here) +- * blocking on the ext3 rwsem = read => lock inversion. ++ * blocking on the ext3 rwsem = read => lock inversion. (kernel 3.1 kills the ++ * rwsem and replaces it by i_dio_count and inode_dio_wait/done.) 
+ * + * The handling gets very ugly when dealing with locked pages. It may be easier + * to just get rid of the locked page code (which has problems of its own) and +@@ -730,14 +731,15 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, + + fsfilt_check_slow(obd, now, "preprw_write setup"); + +- /* Filter truncate first locks i_mutex then partially truncated +- * page, filter write code first locks pages then take +- * i_mutex. To avoid a deadlock in case of concurrent +- * punch/write requests from one client, filter writes and +- * filter truncates are serialized by i_alloc_sem, allowing +- * multiple writes or single truncate. */ +- down_read(&dentry->d_inode->i_alloc_sem); +- fsfilt_check_slow(obd, now, "i_alloc_sem"); ++ /* Filter truncate first locks i_mutex then partially truncated ++ * page, filter write code first locks pages then take ++ * i_mutex. To avoid a deadlock in case of concurrent ++ * punch/write requests from one client, filter writes and ++ * filter truncates are serialized by INODE_DIO_LOCK_READ, allowing ++ * multiple writes or single truncate. */ ++ ++ INODE_DIO_LOCK_READ(dentry->d_inode); ++ fsfilt_check_slow(obd, now, "INODE_DIO_LOCK_READ"); + + /* Don't update inode timestamps if this write is older than a + * setattr which modifies the timestamps. 
b=10150 */ +@@ -895,11 +897,11 @@ cleanup: + } + } + } +- case 3: +- if (rc) +- up_read(&dentry->d_inode->i_alloc_sem); ++ case 3: ++ if (rc) ++ INODE_DIO_RELEASE_READ(dentry->d_inode); + +- filter_iobuf_put(&obd->u.filter, iobuf, oti); ++ filter_iobuf_put(&obd->u.filter, iobuf, oti); + case 2: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (rc) +diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c +index 853e4f5..6d2b89f 100644 +--- a/lustre/obdfilter/filter_io_26.c ++++ b/lustre/obdfilter/filter_io_26.c +@@ -635,10 +635,10 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, + LASSERT(PageLocked(lnb->page)); + LASSERT(!PageWriteback(lnb->page)); + +- /* since write & truncate are serialized by the i_alloc_sem, +- * even partial truncate should not leave dirty pages in +- * the page cache */ +- LASSERT(!PageDirty(lnb->page)); ++ /* since write & truncate are serialized by the inode_dio_wait, ++ * even partial truncate should not leave dirty pages in ++ * the page cache */ ++ LASSERT(!PageDirty(lnb->page)); + + SetPageUptodate(lnb->page); + +@@ -867,7 +867,7 @@ cleanup: + if (fo->fo_writethrough_cache == 0 || + i_size_read(inode) > fo->fo_readcache_max_filesize) + filter_release_cache(obd, obj, nb, inode); +- up_read(&inode->i_alloc_sem); ++ INODE_DIO_RELEASE_READ(inode); + } + + RETURN(rc); +diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c +index 23dc755..b42e41c 100644 +--- a/lustre/osc/osc_cache.c ++++ b/lustre/osc/osc_cache.c +@@ -2695,9 +2695,9 @@ void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. + * +- * Caller must hold inode_mutex and i_alloc_sem, or cancel exclusive +- * dlm lock so that nobody else can dirty this range of file while we're +- * waiting for extents to be written. 
++ * Caller must hold inode_mutex, or cancel exclusive dlm lock so that ++ * nobody else can dirty this range of file while we're waiting for ++ * extents to be written. + */ + int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c +index af3d6af..d266805 100644 +--- a/lustre/osd-ldiskfs/osd_io.c ++++ b/lustre/osd-ldiskfs/osd_io.c +@@ -433,7 +433,6 @@ struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) + /* + * there are following "locks": + * journal_start +- * i_alloc_sem + * i_mutex + * page lock + +-- +1.7.12 + |