From 61592da5eceb5e5945eb9c9499669cafded139bd Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 5 Dec 2022 13:31:20 +0100 Subject: [PATCH 1/2] sys-kernel/coreos-sources: Add backport of bugfix for #847 Users reported a deadlock in ext4 that occurs under load after kernel 5.15.72. We debugged and found that this issue is also present upstream (6.x) and found a fix. The fix has been validated to fix the issue, but we're still waiting for a response from the ext4 maintainer. In the meantime, apply the backport to our kernel sources, so that users can be unblocked from updating. This will be released to alpha/beta first, and hopefully by the time it is promoted to stable, the fix will be merged to the kernel tree and backported to 5.15. --- .../coreos-sources-5.15.79.ebuild | 1 + ...-ext4-Fix-deadlock-due-to-mbcache-en.patch | 129 ++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/files/5.15/z0008-ext4-Fix-deadlock-due-to-mbcache-en.patch diff --git a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/coreos-sources-5.15.79.ebuild b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/coreos-sources-5.15.79.ebuild index e9a1883c76..bd7c48f629 100644 --- a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/coreos-sources-5.15.79.ebuild +++ b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/coreos-sources-5.15.79.ebuild @@ -42,4 +42,5 @@ UNIPATCH_LIST=" ${PATCH_DIR}/z0005-Drivers-hv-vmbus-Propagate-VMbus-coherence-to-each-V.patch \ ${PATCH_DIR}/z0006-PCI-hv-Avoid-the-retarget-interrupt-hypercall-in-irq.patch \ ${PATCH_DIR}/z0007-PCI-hv-Remove-unused-hv_set_msi_entry_from_desc.patch \ + ${PATCH_DIR}/z0008-ext4-Fix-deadlock-due-to-mbcache-en.patch \ " diff --git a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/files/5.15/z0008-ext4-Fix-deadlock-due-to-mbcache-en.patch 
b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/files/5.15/z0008-ext4-Fix-deadlock-due-to-mbcache-en.patch new file mode 100644 index 0000000000..d82de52260 --- /dev/null +++ b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-sources/files/5.15/z0008-ext4-Fix-deadlock-due-to-mbcache-en.patch @@ -0,0 +1,129 @@ +From e7ec42e181c6213d1fd71b946196f05af601ba5c Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 21 Nov 2022 15:44:10 +0100 +Subject: [PATCH] ext4: Fix deadlock due to mbcache entry corruption + +When manipulating xattr blocks, we can deadlock infinitely looping +inside ext4_xattr_block_set() where we constantly keep finding xattr +block for reuse in mbcache but we are unable to reuse it because its +reference count is too big. This happens because cache entry for the +xattr block is marked as reusable (e_reusable set) although its +reference count is too big. When this inconsistency happens, this +inconsistent state is kept indefinitely and so ext4_xattr_block_set() +keeps retrying indefinitely. + +The inconsistent state is caused by non-atomic update of e_reusable bit. +e_reusable is part of a bitfield and e_reusable update can race with +update of e_referenced bit in the same bitfield resulting in loss of one +of the updates. Fix the problem by using atomic bitops instead. 
+ +[jeremi: backport from here https://lore.kernel.org/linux-ext4/20221122174807.GA9658@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net/] +CC: stable@vger.kernel.org +Fixes: 6048c64b2609 ("mbcache: add reusable flag to cache entries") +Reported-by: Jeremi Piotrowski +Reported-by: Thilo Fromm +Signed-off-by: Jan Kara +--- + fs/ext4/xattr.c | 4 ++-- + fs/mbcache.c | 14 ++++++++------ + include/linux/mbcache.h | 9 +++++++-- + 3 files changed, 17 insertions(+), 10 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 533216e80fa2..22700812a4d3 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1281,7 +1281,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + ce = mb_cache_entry_get(ea_block_cache, hash, + bh->b_blocknr); + if (ce) { +- ce->e_reusable = 1; ++ set_bit(MBE_REUSABLE_B, &ce->e_flags); + mb_cache_entry_put(ea_block_cache, ce); + } + } +@@ -2042,7 +2042,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + } + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX) +- ce->e_reusable = 0; ++ clear_bit(MBE_REUSABLE_B, &ce->e_flags); + ea_bdebug(new_bh, "reusing; refcount now=%d", + ref); + ext4_xattr_block_csum_set(inode, new_bh); +diff --git a/fs/mbcache.c b/fs/mbcache.c +index 2010bc80a3f2..ac07b50ea3df 100644 +--- a/fs/mbcache.c ++++ b/fs/mbcache.c +@@ -94,8 +94,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_value = value; +- entry->e_reusable = reusable; +- entry->e_referenced = 0; ++ entry->e_flags = 0; ++ if (reusable) ++ set_bit(MBE_REUSABLE_B, &entry->e_flags); + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { +@@ -155,7 +156,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, + while (node) { + entry = hlist_bl_entry(node, struct mb_cache_entry, + e_hash_list); +- 
if (entry->e_key == key && entry->e_reusable) { ++ if (entry->e_key == key && ++ test_bit(MBE_REUSABLE_B, &entry->e_flags)) { + atomic_inc(&entry->e_refcnt); + goto out; + } +@@ -325,7 +327,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get); + void mb_cache_entry_touch(struct mb_cache *cache, + struct mb_cache_entry *entry) + { +- entry->e_referenced = 1; ++ set_bit(MBE_REFERENCED_B, &entry->e_flags); + } + EXPORT_SYMBOL(mb_cache_entry_touch); + +@@ -350,8 +352,8 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, + while (nr_to_scan-- && !list_empty(&cache->c_list)) { + entry = list_first_entry(&cache->c_list, + struct mb_cache_entry, e_list); +- if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) { +- entry->e_referenced = 0; ++ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_read(&entry->e_refcnt) > 2) { ++ clear_bit(MBE_REFERENCED_B, &entry->e_flags); + list_move_tail(&entry->e_list, &cache->c_list); + continue; + } +diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h +index 8eca7f25c432..62927f7e2588 100644 +--- a/include/linux/mbcache.h ++++ b/include/linux/mbcache.h +@@ -10,6 +10,12 @@ + + struct mb_cache; + ++/* Cache entry flags */ ++enum { ++ MBE_REFERENCED_B = 0, ++ MBE_REUSABLE_B ++}; ++ + struct mb_cache_entry { + /* List of entries in cache - protected by cache->c_list_lock */ + struct list_head e_list; +@@ -18,8 +24,7 @@ struct mb_cache_entry { + atomic_t e_refcnt; + /* Key in hash - stable during lifetime of the entry */ + u32 e_key; +- u32 e_referenced:1; +- u32 e_reusable:1; ++ unsigned long e_flags; + /* User provided value - stable during lifetime of the entry */ + u64 e_value; + }; +-- +2.25.1 + From b7eec9eed77d5bffd9df8933f9a7a57b82057f7b Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 6 Dec 2022 11:42:15 +0100 Subject: [PATCH 2/2] changelog: add entry for Flatcar#847 bugfix --- .../changelog/bugfixes/2022-12-06-kernel-bug-847.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 
sdk_container/src/third_party/coreos-overlay/changelog/bugfixes/2022-12-06-kernel-bug-847.md diff --git a/sdk_container/src/third_party/coreos-overlay/changelog/bugfixes/2022-12-06-kernel-bug-847.md b/sdk_container/src/third_party/coreos-overlay/changelog/bugfixes/2022-12-06-kernel-bug-847.md new file mode 100644 index 0000000000..b66b0bcc0d --- /dev/null +++ b/sdk_container/src/third_party/coreos-overlay/changelog/bugfixes/2022-12-06-kernel-bug-847.md @@ -0,0 +1 @@ +- Fix "ext4 deadlock under heavy I/O load" kernel issue. The patch for this is included provisionally while we wait for it to be merged upstream ([Flatcar#847](https://github.com/flatcar/Flatcar/issues/847), [coreos-overlay#2315](https://github.com/flatcar/coreos-overlay/pull/2315))