bcachefs: Rework btree node pinning
author Kent Overstreet <kent.overstreet@linux.dev>
Thu, 5 Sep 2024 00:49:37 +0000 (20:49 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sat, 21 Sep 2024 15:39:48 +0000 (11:39 -0400)
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers

Checking references generates random lookups, so we want to pin that
btree in memory (or only a range of it, if it doesn't fit in RAM).

Previously, this was done with a simple check in the shrinker - "if the
btree node is in the range being pinned, don't free it" - but this
generated OOMs, as our shrinker wasn't well behaved when less memory was
available than expected.

Instead, we now have two different shrinkers and LRU lists; the second
shrinker is for pinned nodes, with its seeks value set much higher than
normal - so pinned nodes can still be freed if necessary, but we'll
prefer not to.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
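
To make the reclaim preference concrete, here is a minimal standalone C
sketch (not bcachefs code; the struct names, the seeks-based scaling and
the numbers are invented for illustration) of two LRU lists whose
shrinkers apply different seeks values:

/*
 * Toy sketch only: model btree nodes on two LRU lists, "normal" and
 * "pinned", each drained by its own shrinker-like scan.  A higher seeks
 * value makes a list give up fewer nodes for the same memory pressure.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int		id;
	struct node	*next;
};

struct lru {
	const char	*name;
	unsigned	seeks;		/* higher => more reluctant to free */
	struct node	*head;
	size_t		nr;
};

static void lru_push(struct lru *l, int id)
{
	struct node *n = malloc(sizeof(*n));

	n->id	= id;
	n->next	= l->head;
	l->head	= n;
	l->nr++;
}

/* Free up to @nr nodes from the front of the list. */
static void lru_scan(struct lru *l, size_t nr)
{
	for (size_t freed = 0; l->head && freed < nr; freed++) {
		struct node *n = l->head;

		printf("  freed node %d from %s\n", n->id, l->name);
		l->head = n->next;
		l->nr--;
		free(n);
	}
}

/* Scale the pressure each list sees by its seeks value. */
static void shrink(struct lru *normal, struct lru *pinned, size_t pressure)
{
	printf("pressure %zu:\n", pressure);
	lru_scan(normal, pressure / normal->seeks);
	lru_scan(pinned, pressure / pinned->seeks);
}

int main(void)
{
	struct lru normal = { .name = "normal", .seeks = 2 };
	struct lru pinned = { .name = "pinned", .seeks = 8 };

	for (int i = 0; i < 8; i++)
		lru_push(&normal, i);
	for (int i = 100; i < 104; i++)
		lru_push(&pinned, i);

	shrink(&normal, &pinned, 6);	/* light pressure: pinned list untouched */
	shrink(&normal, &pinned, 24);	/* heavy pressure: pinned nodes freed too */

	printf("left: normal %zu, pinned %zu\n", normal.nr, pinned.nr);
	return 0;
}

Run as-is, the light shrink only drains the normal list; the heavier one
also reclaims most (but not all) of the pinned nodes, which is the
behaviour the higher seeks value is after.
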
fs/bcachefs/backpointers.c
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_cache.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/journal_reclaim.c
fs/bcachefs/sysfs.c

diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 6c395445f60cc678a2c89d7a2ab0441eedd0162a..b88d1801652c315310e889a0e1dc753fc56c2de2 100644
@@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
        s64 mem_may_pin = mem_may_pin_bytes(c);
        int ret = 0;
 
+       bch2_btree_cache_unpin(c);
+
        btree_interior_mask |= btree_leaf_mask;
 
-       c->btree_cache.pinned_nodes_leaf_mask           = btree_leaf_mask;
-       c->btree_cache.pinned_nodes_interior_mask       = btree_interior_mask;
+       c->btree_cache.pinned_nodes_mask[0]             = btree_leaf_mask;
+       c->btree_cache.pinned_nodes_mask[1]             = btree_interior_mask;
        c->btree_cache.pinned_nodes_start               = start;
        c->btree_cache.pinned_nodes_end                 = *end = BBPOS_MAX;
 
@@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
                                        BBPOS(btree, b->key.k.p);
                                break;
                        }
+                       bch2_node_pin(c, b);
                        0;
                }));
        }
@@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
        bch2_trans_put(trans);
        bch2_bkey_buf_exit(&s.last_flushed, c);
 
-       c->btree_cache.pinned_nodes_leaf_mask = 0;
-       c->btree_cache.pinned_nodes_interior_mask = 0;
+       bch2_btree_cache_unpin(c);
 
        bch_err_fn(c, ret);
        return ret;
@@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
        }
        bch2_trans_put(trans);
 
-       c->btree_cache.pinned_nodes_leaf_mask = 0;
-       c->btree_cache.pinned_nodes_interior_mask = 0;
+       bch2_btree_cache_unpin(c);
 
        bch_err_fn(c, ret);
        return ret;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index e66853b9885728e159d124e0b8759ad190f50ac0..6e4afb2b54413bf67c06e1d42a3f596c358d647c 100644
@@ -47,9 +47,14 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
        c->btree_cache.nr_reserve = reserve;
 }
 
-static inline size_t btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
 {
-       return max_t(int, 0, bc->nr_live + bc->nr_freeable - bc->nr_reserve);
+       struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+       size_t can_free = list->nr;
+       if (!list->idx)
+               can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+       return can_free;
 }
 
 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -184,6 +189,51 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
        six_unlock_intent(&b->c.lock);
 }
 
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+       struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+       u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+       return ((mask & BIT_ULL(b->c.btree_id)) &&
+               bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+               bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+       struct btree_cache *bc = &c->btree_cache;
+
+       mutex_lock(&bc->lock);
+       BUG_ON(!__btree_node_pinned(bc, b));
+       if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+               set_btree_node_pinned(b);
+               list_move(&b->list, &bc->live[1].list);
+               bc->live[0].nr--;
+               bc->live[1].nr++;
+       }
+       mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b, *n;
+
+       mutex_lock(&bc->lock);
+       c->btree_cache.pinned_nodes_mask[0] = 0;
+       c->btree_cache.pinned_nodes_mask[1] = 0;
+
+       list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+               clear_btree_node_pinned(b);
+               list_move(&b->list, &bc->live[0].list);
+               bc->live[0].nr++;
+               bc->live[1].nr--;
+       }
+
+       mutex_unlock(&bc->lock);
+}
+
 /* Btree in memory cache - hash table */
 
 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -199,7 +249,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
        if (b->c.btree_id < BTREE_ID_NR)
                --bc->nr_by_btree[b->c.btree_id];
 
-       bc->nr_live--;
+       bc->live[btree_node_pinned(b)].nr--;
        bc->nr_freeable++;
        list_move(&b->list, &bc->freeable);
 }
@@ -216,9 +266,14 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
 
        if (b->c.btree_id < BTREE_ID_NR)
                bc->nr_by_btree[b->c.btree_id]++;
-       bc->nr_live++;
+
+       bool p = __btree_node_pinned(bc, b);
+       mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+       list_move_tail(&b->list, &bc->live[p].list);
+       bc->live[p].nr++;
+
        bc->nr_freeable--;
-       list_move_tail(&b->list, &bc->live);
        return 0;
 }
 
@@ -283,20 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
        int ret = 0;
 
        lockdep_assert_held(&bc->lock);
-
-       struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
-       u64 mask = b->c.level
-               ? bc->pinned_nodes_interior_mask
-               : bc->pinned_nodes_leaf_mask;
-
-       if ((mask & BIT_ULL(b->c.btree_id)) &&
-           bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
-           bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) {
-               BTREE_CACHE_NOT_FREED_INCREMENT(pinned);
-               return -BCH_ERR_ENOMEM_btree_node_reclaim;
-       }
-
 wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
@@ -401,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
 {
-       struct bch_fs *c = shrink->private_data;
-       struct btree_cache *bc = &c->btree_cache;
+       struct btree_cache_list *list = shrink->private_data;
+       struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free = 0;
@@ -410,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
        unsigned long touched = 0;
        unsigned i, flags;
        unsigned long ret = SHRINK_STOP;
-       bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >=
-               (bc->nr_live + bc->nr_freeable) * 3 / 4;
+       bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
 
        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;
@@ -426,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
-       can_free = btree_cache_can_free(bc);
+       can_free = btree_cache_can_free(list);
        nr = min_t(unsigned long, nr, can_free);
 
        i = 0;
@@ -452,7 +493,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                }
        }
 restart:
-       list_for_each_entry_safe(b, t, &bc->live, list) {
+       list_for_each_entry_safe(b, t, &list->list, list) {
                touched++;
 
                if (btree_node_accessed(b)) {
@@ -476,7 +517,7 @@ restart:
                           !btree_node_will_make_reachable(b) &&
                           !btree_node_write_blocked(b) &&
                           six_trylock_read(&b->c.lock)) {
-                       list_move(&bc->live, &b->list);
+                       list_move(&list->list, &b->list);
                        mutex_unlock(&bc->lock);
                        __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                        six_unlock_read(&b->c.lock);
@@ -490,8 +531,8 @@ restart:
                        break;
        }
 out_rotate:
-       if (&t->list != &bc->live)
-               list_move_tail(&bc->live, &t->list);
+       if (&t->list != &list->list)
+               list_move_tail(&list->list, &t->list);
 out:
        mutex_unlock(&bc->lock);
 out_nounlock:
@@ -504,40 +545,42 @@ out_nounlock:
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
 {
-       struct bch_fs *c = shrink->private_data;
-       struct btree_cache *bc = &c->btree_cache;
+       struct btree_cache_list *list = shrink->private_data;
 
        if (bch2_btree_shrinker_disabled)
                return 0;
 
-       return btree_cache_can_free(bc);
+       return btree_cache_can_free(list);
 }
 
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
-       unsigned i, flags;
+       unsigned long flags;
 
-       shrinker_free(bc->shrink);
+       shrinker_free(bc->live[1].shrink);
+       shrinker_free(bc->live[0].shrink);
 
        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);
 
        if (c->verify_data)
-               list_move(&c->verify_data->list, &bc->live);
+               list_move(&c->verify_data->list, &bc->live[0].list);
 
        kvfree(c->verify_ondisk);
 
-       for (i = 0; i < btree_id_nr_alive(c); i++) {
+       for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
                if (r->b)
-                       list_add(&r->b->list, &bc->live);
+                       list_add(&r->b->list, &bc->live[0].list);
        }
 
-       list_for_each_entry_safe(b, t, &bc->live, list)
+       list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+               bch2_btree_node_hash_remove(bc, b);
+       list_for_each_entry_safe(b, t, &bc->live[0].list, list)
                bch2_btree_node_hash_remove(bc, b);
 
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
@@ -563,7 +606,8 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
        for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
                BUG_ON(bc->nr_by_btree[i]);
-       BUG_ON(bc->nr_live);
+       BUG_ON(bc->live[0].nr);
+       BUG_ON(bc->live[1].nr);
        BUG_ON(bc->nr_freeable);
 
        if (bc->table_init_done)
@@ -589,18 +633,28 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
                if (!__bch2_btree_node_mem_alloc(c))
                        goto err;
 
-       list_splice_init(&bc->live, &bc->freeable);
+       list_splice_init(&bc->live[0].list, &bc->freeable);
 
        mutex_init(&c->verify_lock);
 
        shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
        if (!shrink)
                goto err;
-       bc->shrink = shrink;
+       bc->live[0].shrink      = shrink;
+       shrink->count_objects   = bch2_btree_cache_count;
+       shrink->scan_objects    = bch2_btree_cache_scan;
+       shrink->seeks           = 2;
+       shrink->private_data    = &bc->live[0];
+       shrinker_register(shrink);
+
+       shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+       if (!shrink)
+               goto err;
+       bc->live[1].shrink      = shrink;
        shrink->count_objects   = bch2_btree_cache_count;
        shrink->scan_objects    = bch2_btree_cache_scan;
-       shrink->seeks           = 4;
-       shrink->private_data    = c;
+       shrink->seeks           = 8;
+       shrink->private_data    = &bc->live[1];
        shrinker_register(shrink);
 
        return 0;
@@ -611,7 +665,10 @@ err:
 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
 {
        mutex_init(&bc->lock);
-       INIT_LIST_HEAD(&bc->live);
+       for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+               bc->live[i].idx = i;
+               INIT_LIST_HEAD(&bc->live[i].list);
+       }
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed_pcpu);
        INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -673,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
 
-       list_for_each_entry_reverse(b, &bc->live, list)
-               if (!btree_node_reclaim(c, b, false))
-                       return b;
+       for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+               list_for_each_entry_reverse(b, &bc->live[i].list, list)
+                       if (!btree_node_reclaim(c, b, false))
+                               return b;
 
        while (1) {
-               list_for_each_entry_reverse(b, &bc->live, list)
-                       if (!btree_node_write_and_reclaim(c, b))
-                               return b;
+               for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+                       list_for_each_entry_reverse(b, &bc->live[i].list, list)
+                               if (!btree_node_write_and_reclaim(c, b))
+                                       return b;
 
                /*
                 * Rare case: all nodes were intent-locked.
@@ -1387,9 +1446,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
        if (!out->nr_tabstops)
                printbuf_tabstop_push(out, 32);
 
-       prt_btree_cache_line(out, c, "nr_live:",        bc->nr_live);
-       prt_btree_cache_line(out, c, "nr_freeable:",    bc->nr_freeable);
-       prt_btree_cache_line(out, c, "nr dirty:",       atomic_long_read(&bc->nr_dirty));
+       prt_btree_cache_line(out, c, "live:",           bc->live[0].nr);
+       prt_btree_cache_line(out, c, "pinned:",         bc->live[1].nr);
+       prt_btree_cache_line(out, c, "freeable:",       bc->nr_freeable);
+       prt_btree_cache_line(out, c, "dirty:",          atomic_long_read(&bc->nr_dirty));
        prt_printf(out, "cannibalize lock:\t%p\n",      bc->alloc_lock);
        prt_newline(out);
 
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index f82064007127203e80033359ec4cd4010cccad79..367acd217c6a86fc7e4a1abe511954751d3a9999 100644
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
 int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
                                unsigned, enum btree_id);
 
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
 void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
                                      struct bkey_s_c, struct bkey_i *);
 
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index ee3df2a486cccb59f685d8f2c59ea59fd5a675ae..4568a41fefaf63bee803c169eb8b07a30d625a40 100644
@@ -147,8 +147,7 @@ struct btree {
        x(noevict)                              \
        x(write_blocked)                        \
        x(will_make_reachable)                  \
-       x(access_bit)                           \
-       x(pinned)                               \
+       x(access_bit)
 
 enum bch_btree_cache_not_freed_reasons {
 #define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
@@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons {
        BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
 };
 
+struct btree_cache_list {
+       unsigned                idx;
+       struct shrinker         *shrink;
+       struct list_head        list;
+       size_t                  nr;
+};
+
 struct btree_cache {
        struct rhashtable       table;
        bool                    table_init_done;
@@ -174,12 +180,11 @@ struct btree_cache {
         * should never grow past ~2-3 nodes in practice.
         */
        struct mutex            lock;
-       struct list_head        live;
        struct list_head        freeable;
        struct list_head        freed_pcpu;
        struct list_head        freed_nonpcpu;
+       struct btree_cache_list live[2];
 
-       size_t                  nr_live;
        size_t                  nr_freeable;
        size_t                  nr_reserve;
        size_t                  nr_by_btree[BTREE_ID_NR];
@@ -188,7 +193,6 @@ struct btree_cache {
        /* shrinker stats */
        size_t                  nr_freed;
        u64                     not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
-       struct shrinker         *shrink;
 
        /*
         * If we need to allocate memory for a new btree node and that
@@ -201,8 +205,8 @@ struct btree_cache {
 
        struct bbpos            pinned_nodes_start;
        struct bbpos            pinned_nodes_end;
-       u64                     pinned_nodes_leaf_mask;
-       u64                     pinned_nodes_interior_mask;
+       /* btree id mask: 0 for leaves, 1 for interior */
+       u64                     pinned_nodes_mask[2];
 };
 
 struct btree_node_iter {
@@ -594,7 +598,8 @@ enum btree_write_type {
        x(dying)                                                        \
        x(fake)                                                         \
        x(need_rewrite)                                                 \
-       x(never_write)
+       x(never_write)                                                  \
+       x(pinned)
 
 enum btree_flags {
        /* First bits for btree node write type */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 18494a662e0a5b7a49d472f72a2865c67b93ff50..190bc1e81756ca9c3c4f48ca41ef4e38bd8e8701 100644
@@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
        six_unlock_intent(&n->c.lock);
 
        mutex_lock(&c->btree_cache.lock);
-       list_add_tail(&b->list, &c->btree_cache.live);
+       list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
        mutex_unlock(&c->btree_cache.lock);
 
        bch2_trans_verify_locks(trans);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f8e0459827530e386d1bee7a44883ff3421ce902..ace291f175dd6b0f45d6fcabd3a30391b7a47911 100644
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
 static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct btree_cache *bc = &c->btree_cache;
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        u64 seq_to_flush;
        size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                if (j->watermark != BCH_WATERMARK_stripe)
                        min_nr = 1;
 
-               if (atomic_long_read(&c->btree_cache.nr_dirty) * 2 > c->btree_cache.nr_live)
+               size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+               if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
                        min_nr = 1;
 
                min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                trace_and_count(c, journal_reclaim_start, c,
                                direct, kicked,
                                min_nr, min_key_cache,
-                               atomic_long_read(&c->btree_cache.nr_dirty),
-                               c->btree_cache.nr_live,
+                               atomic_long_read(&bc->nr_dirty), btree_cache_live,
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
                                atomic_long_read(&c->btree_key_cache.nr_keys));
 
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 6791540d6a4aa2baddd72d3073bf0014d1c039a2..03e59f86f360d00b7682019ec33853e8ce0b7c1f 100644
@@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
 
 static size_t bch2_btree_cache_size(struct bch_fs *c)
 {
+       struct btree_cache *bc = &c->btree_cache;
        size_t ret = 0;
        struct btree *b;
 
-       mutex_lock(&c->btree_cache.lock);
-       list_for_each_entry(b, &c->btree_cache.live, list)
+       mutex_lock(&bc->lock);
+       list_for_each_entry(b, &bc->live[0].list, list)
                ret += btree_buf_bytes(b);
-
-       mutex_unlock(&c->btree_cache.lock);
+       list_for_each_entry(b, &bc->live[1].list, list)
+               ret += btree_buf_bytes(b);
+       list_for_each_entry(b, &bc->freeable, list)
+               ret += btree_buf_bytes(b);
+       mutex_unlock(&bc->lock);
        return ret;
 }
 
@@ -444,11 +448,12 @@ STORE(bch2_fs)
                return -EROFS;
 
        if (attr == &sysfs_trigger_btree_cache_shrink) {
+               struct btree_cache *bc = &c->btree_cache;
                struct shrink_control sc;
 
                sc.gfp_mask = GFP_KERNEL;
                sc.nr_to_scan = strtoul_or_return(buf);
-               c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+               bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
        }
 
        if (attr == &sysfs_trigger_btree_key_cache_shrink) {