Skip to content

Commit

Permalink
lib/radix-tree.c: make radix_tree_node_alloc() work correctly within …
Browse files Browse the repository at this point in the history
…interrupt

When users of radix_tree_preload() run from interrupt context
(block/blk-ioc.c is one such possible user), the following race can happen:

radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];
<interrupt>
...
radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];

And we give out one radix tree node twice.  That clearly results in radix
tree corruption with different results (usually OOPS) depending on which
two users of radix tree race.

We fix the problem by making radix_tree_node_alloc() always allocate fresh
radix tree nodes when in interrupt.  Using preloading when in interrupt
doesn't make sense since all the allocations have to be atomic anyway and
we cannot steal nodes from process-context users because some users rely
on radix_tree_insert() succeeding after radix_tree_preload().
The in_interrupt() check is somewhat ugly, but we cannot simply key off the
passed gfp_mask, as that is acquired from root_gfp_mask() and is thus the
same for all preload users.

Another part of the fix is to avoid node preallocation in
radix_tree_preload() when passed gfp_mask doesn't allow waiting.  Again,
preallocation in such case doesn't make sense and when preallocation would
happen in interrupt we could possibly leak some allocated nodes.  However,
some users of radix_tree_preload() require following radix_tree_insert()
to succeed.  To avoid unexpected effects for these users,
radix_tree_preload() only warns if passed gfp mask doesn't allow waiting
and we provide a new function radix_tree_maybe_preload() for those users
which get different gfp mask from different call sites and which are
prepared to handle radix_tree_insert() failure.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Change-Id: Idb94cda52278390f86ff44f4915e4c14968ea896
  • Loading branch information
jankara authored and ananjaser1211 committed Apr 20, 2021
1 parent 6493fc9 commit 053a672
Show file tree
Hide file tree
Showing 19 changed files with 1,286 additions and 1,249 deletions.
2 changes: 1 addition & 1 deletion block/blk-ioc.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
if (!icq)
return NULL;

if (radix_tree_preload(gfp_mask) < 0) {
if (radix_tree_maybe_preload(gfp_mask) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/block/brd.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
if (!page)
return NULL;

if (radix_tree_preload(GFP_NOIO)) {
if (radix_tree_maybe_preload(GFP_NOIO)) {
__free_page(page);
return NULL;
}
Expand Down
2 changes: 1 addition & 1 deletion fs/btrfs/delayed-inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
atomic_inc(&node->refs); /* cached in the btrfs inode */
atomic_inc(&node->refs); /* can be accessed */

ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
ret = radix_tree_maybe_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret) {
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(ret);
Expand Down
2 changes: 1 addition & 1 deletion fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -1581,7 +1581,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
if (ret == 0)
root->orphan_item_inserted = 1;

ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
ret = radix_tree_maybe_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto fail;

Expand Down
2 changes: 1 addition & 1 deletion fs/btrfs/extent_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -4378,7 +4378,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
ret = radix_tree_maybe_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto free_eb;

Expand Down
2 changes: 1 addition & 1 deletion fs/f2fs/checkpoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)

tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
retry:
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
radix_tree_maybe_preload(GFP_NOFS | __GFP_NOFAIL);

spin_lock(&im->ino_lock);
e = radix_tree_lookup(&im->ino_root, ino);
Expand Down
2 changes: 1 addition & 1 deletion fs/f2fs/node.c
Original file line number Diff line number Diff line change
Expand Up @@ -1765,7 +1765,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
i->nid = nid;
i->state = NID_NEW;

if (radix_tree_preload(GFP_NOFS)) {
if (radix_tree_maybe_preload(GFP_NOFS)) {
kmem_cache_free(free_nid_slab, i);
return 0;
}
Expand Down
2 changes: 1 addition & 1 deletion fs/f2fs/trace.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ void f2fs_trace_pid(struct page *page)

page->private = pid;

if (radix_tree_preload(GFP_NOFS))
if (radix_tree_maybe_preload(GFP_NOFS))
return;

spin_lock(&pids_lock);
Expand Down
2 changes: 1 addition & 1 deletion fs/fscache/page.c
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_release_write_op);
op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);

ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
goto nomem_free;

Expand Down
2 changes: 1 addition & 1 deletion fs/nilfs2/btnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
* than 2.6.23, because it is not exported for modules.
*/
retry:
err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
err = radix_tree_maybe_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (err)
goto failed_unlock;
/* BUG_ON(oldkey != obh->b_page->index); */
Expand Down
2 changes: 1 addition & 1 deletion fs/xfs/xfs_mount.c
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_buf_lock);
pag->pag_buf_tree = RB_ROOT;

if (radix_tree_preload(GFP_NOFS))
if (radix_tree_maybe_preload(GFP_NOFS))
goto out_unwind;

spin_lock(&mp->m_perag_lock);
Expand Down
2 changes: 1 addition & 1 deletion fs/xfs/xfs_mru_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ xfs_mru_cache_insert(
if (!elem)
return ENOMEM;

if (radix_tree_preload(GFP_KERNEL)) {
if (radix_tree_maybe_preload(GFP_KERNEL)) {
kmem_zone_free(xfs_mru_elem_zone, elem);
return ENOMEM;
}
Expand Down
1 change: 1 addition & 0 deletions include/linux/radix-tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
unsigned long index, unsigned long max_scan);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag);
Expand Down
42 changes: 39 additions & 3 deletions lib/radix-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/rcupdate.h>

#include <linux/hardirq.h> /* in_interrupt() */

#ifdef __KERNEL__
#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
Expand Down Expand Up @@ -207,7 +207,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
struct radix_tree_node *ret = NULL;
gfp_t gfp_mask = root_gfp_mask(root);

if (!(gfp_mask & __GFP_WAIT)) {
/*
* Preload code isn't irq safe and it doesn't make sense to use
* preloading in the interrupt anyway as all the allocations have to
* be atomic. So just do normal allocation when in interrupt.
*/
if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
struct radix_tree_preload *rtp;

/*
Expand Down Expand Up @@ -264,7 +269,7 @@ radix_tree_node_free(struct radix_tree_node *node)
* To make use of this facility, the radix tree must be initialised without
* __GFP_WAIT being passed to INIT_RADIX_TREE().
*/
int radix_tree_preload(gfp_t gfp_mask)
static int __radix_tree_preload(gfp_t gfp_mask)
{
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
Expand All @@ -288,8 +293,39 @@ int radix_tree_preload(gfp_t gfp_mask)
out:
return ret;
}

/*
 * radix_tree_preload - fill this CPU's radix_tree_node buffer
 * @gfp_mask: allocation flags used for the preallocation
 *
 * Stocks the per-CPU node buffer with enough objects that adding a single
 * element to the tree cannot fail.
 *
 * Returns zero on success, with preemption disabled.  Returns -ENOMEM on
 * error, with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_WAIT being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
	/* A mask that cannot wait is a nonsensical way to call this. */
	const int cannot_wait = !(gfp_mask & __GFP_WAIT);

	/* Warn once about such use, but still attempt the preload. */
	WARN_ON_ONCE(cannot_wait);

	return __radix_tree_preload(gfp_mask);
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * radix_tree_maybe_preload - preload only when it can help
 * @gfp_mask: allocation flags used for the preallocation
 *
 * Like radix_tree_preload(), except that preloading is not guaranteed to
 * happen: it is done only when @gfp_mask contains __GFP_WAIT.
 *
 * Returns zero on success, with preemption disabled.  Returns -ENOMEM on
 * error, with preemption not disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
	if (!(gfp_mask & __GFP_WAIT)) {
		/*
		 * Preloading doesn't help anything with this gfp mask, so
		 * skip it.  Preemption is still disabled so the success
		 * path looks the same to the caller as a real preload.
		 */
		preempt_disable();
		return 0;
	}

	return __radix_tree_preload(gfp_mask);
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
* Return the maximum key which can be store into a
* radix tree with height HEIGHT.
Expand Down
4 changes: 2 additions & 2 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
VM_BUG_ON(!PageLocked(new));
VM_BUG_ON(new->mapping);

error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
Expand Down Expand Up @@ -478,7 +478,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (error)
goto out;

error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
page_cache_get(page);
page->mapping = mapping;
Expand Down
2 changes: 1 addition & 1 deletion mm/shmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
gfp & GFP_RECLAIM_MASK);
if (error)
goto decused;
error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
gfp, NULL);
Expand Down
6 changes: 3 additions & 3 deletions mm/swap_state.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;

error = radix_tree_preload(gfp_mask);
error = radix_tree_maybe_preload(gfp_mask);
if (!error) {
error = __add_to_swap_cache(page, entry);
radix_tree_preload_end();
Expand Down Expand Up @@ -327,9 +327,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}

/*
* call radix_tree_preload() while we can wait.
* call radix_tree_maybe_preload() while we can wait.
*/
err = radix_tree_preload(gfp_mask & GFP_KERNEL);
err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
if (err)
break;

Expand Down
2 changes: 1 addition & 1 deletion mm/vmalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -856,7 +856,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
return ERR_CAST(va);
}

err = radix_tree_preload(gfp_mask);
err = radix_tree_maybe_preload(gfp_mask);
if (unlikely(err)) {
kfree(vb);
free_vmap_area(va);
Expand Down
Loading

0 comments on commit 053a672

Please sign in to comment.