blob: e25133a9e9dfe7645612a641e935c6d203e4a380 [file] [log] [blame]
Googler9398cc32022-12-02 17:21:52 +08001// SPDX-License-Identifier: GPL-2.0
Googleraf606d22022-10-26 21:40:12 -07002/*
3 * Copyright (C) 2007,2008 Oracle. All rights reserved.
4 */
5
6#include <linux/sched.h>
7#include <linux/slab.h>
8#include <linux/rbtree.h>
Googler9398cc32022-12-02 17:21:52 +08009#include <linux/mm.h>
Googleraf606d22022-10-26 21:40:12 -070010#include "ctree.h"
11#include "disk-io.h"
12#include "transaction.h"
13#include "print-tree.h"
14#include "locking.h"
Googler9398cc32022-12-02 17:21:52 +080015#include "volumes.h"
16#include "qgroup.h"
Googleraf606d22022-10-26 21:40:12 -070017
18static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19 *root, struct btrfs_path *path, int level);
Googler9398cc32022-12-02 17:21:52 +080020static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21 const struct btrfs_key *ins_key, struct btrfs_path *path,
22 int data_size, int extend);
Googleraf606d22022-10-26 21:40:12 -070023static int push_node_left(struct btrfs_trans_handle *trans,
Googler9398cc32022-12-02 17:21:52 +080024 struct extent_buffer *dst,
Googleraf606d22022-10-26 21:40:12 -070025 struct extent_buffer *src, int empty);
26static int balance_node_right(struct btrfs_trans_handle *trans,
Googleraf606d22022-10-26 21:40:12 -070027 struct extent_buffer *dst_buf,
28 struct extent_buffer *src_buf);
29static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30 int level, int slot);
Googler9398cc32022-12-02 17:21:52 +080031
/*
 * Checksum algorithms supported on disk, indexed by the superblock's csum
 * type.  Only crc32c is present in this version.
 */
static const struct btrfs_csums {
	u16 size;		/* digest size in bytes */
	const char *name;	/* human readable algorithm name */
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
};
38
/*
 * Return the checksum size, in bytes, used by super block @s.
 */
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}
47
/*
 * Return the human readable name of checksum algorithm @csum_type.
 */
const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}
Googleraf606d22022-10-26 21:40:12 -070053
/*
 * Allocate a zeroed btrfs_path from the path slab cache.  Returns NULL on
 * allocation failure.  Release with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
58
/*
 * set all locked nodes in the path to blocking locks. This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		/* Skip levels with no node or no lock held */
		if (!p->nodes[i] || !p->locks[i])
			continue;
		/*
		 * If we currently have a spinning reader or writer lock this
		 * will bump the count of blocking holders and drop the
		 * spinlock.
		 */
		if (p->locks[i] == BTRFS_READ_LOCK) {
			btrfs_set_lock_blocking_read(p->nodes[i]);
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		} else if (p->locks[i] == BTRFS_WRITE_LOCK) {
			btrfs_set_lock_blocking_write(p->nodes[i]);
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
		}
	}
}
83
84/* this also releases the path */
85void btrfs_free_path(struct btrfs_path *p)
86{
87 if (!p)
88 return;
89 btrfs_release_path(p);
90 kmem_cache_free(btrfs_path_cachep, p);
91}
92
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
115
/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
149
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root. A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		/* Still the root after we got the lock?  Then we're done. */
		if (eb == root->node)
			break;
		/* Root changed under us (COW); drop and retry. */
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}
168
/* loop around taking references on and read locking the root node of the
 * tree until you end up with a read lock on the root. A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		/* Still the root after we got the lock?  Then we're done. */
		if (eb == root->node)
			break;
		/* Root changed under us (COW); drop and retry. */
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}
187
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list. transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* Fast path: already dirty, or this root is not tracked at all */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
212
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Reference counted roots must be copied inside the running transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* First key of the buffer seeds the new block's allocation hint */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Clone the whole block, then fix up the header for the copy */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Reloc trees take full backrefs, everything else normal refs */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
271
/* Operation types recorded in the tree modification log. */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
281
/* Snapshot of a replaced root for MOD_LOG_ROOT_REPLACE entries. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
286
/*
 * One entry in the tree modification log, kept in the
 * fs_info->tree_mod_log rb tree ordered by (logical, seq).
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;	/* logical address of the affected block */
	u64 seq;	/* tree mod sequence number of this change */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
312
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
320
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
341
/*
 * Drop the blocker @elem from the tree mod log's blocker list and garbage
 * collect log entries no longer needed by any remaining blocker.  Entries
 * with a sequence number below the minimum of all remaining blockers are
 * erased from the log.  A no-op if @elem holds no sequence number.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				write_unlock(&fs_info->tree_mod_log_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Save the successor before erasing the current node */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
389
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Assigns @tm a fresh sequence number and links it into the rb tree.
 * Returns 0 on success or -EEXIST if an identical (logical, seq) entry
 * already exists.  Caller must hold tree_mod_log_lock for writing.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
431
432/*
433 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
434 * returns zero with the tree_mod_log_lock acquired. The caller must hold
435 * this until all tree mod log insertions are recorded in the rb tree and then
436 * write unlock fs_info::tree_mod_log_lock.
437 */
438static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
439 struct extent_buffer *eb) {
440 smp_mb();
441 if (list_empty(&(fs_info)->tree_mod_seq_list))
442 return 1;
443 if (eb && btrfs_header_level(eb) == 0)
444 return 1;
445
446 write_lock(&fs_info->tree_mod_log_lock);
447 if (list_empty(&(fs_info)->tree_mod_seq_list)) {
448 write_unlock(&fs_info->tree_mod_log_lock);
449 return 1;
450 }
451
452 return 0;
453}
454
455/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
456static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
457 struct extent_buffer *eb)
458{
459 smp_mb();
460 if (list_empty(&(fs_info)->tree_mod_seq_list))
461 return 0;
462 if (eb && btrfs_header_level(eb) == 0)
463 return 0;
464
465 return 1;
466}
467
/*
 * Allocate and fill a tree_mod_elem describing @op on @slot of @eb.
 * For every op except MOD_LOG_KEY_ADD the current key and block pointer
 * are captured so the change can be rewound later.  Returns NULL on
 * allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
490
/*
 * Log a single key operation (@op on @slot of @eb) in the tree mod log.
 * Returns 0 on success (including when logging is not needed) or a
 * negative errno.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* Logging may have become unnecessary while we allocated */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
516
/*
 * Log a move of @nr_items keys from @src_slot to @dst_slot inside @eb.
 * Keys overwritten by a move towards the beginning of the buffer
 * (dst_slot < src_slot) are additionally logged as removals.
 * Returns 0 on success or a negative errno; on failure all entries
 * inserted so far are unwound.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* Unwind: erase any entries already linked into the rb tree */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
589
/*
 * Insert @nritems pre-allocated removal entries into the log, iterating
 * slots in reverse order.  On failure, entries inserted so far are erased
 * again.  Caller must hold tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
610
/*
 * Log the replacement of @old_root by @new_root.  When @log_removal is set
 * and the old root is not a leaf, all its pointers are logged as removals
 * first so rewinds can reconstruct it.  Returns 0 on success or a negative
 * errno.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
		struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Root replace entries are keyed by the *new* root's address */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
679
/*
 * Search the tree mod log for an entry at logical address @start with
 * seq >= @min_seq.  With @smallest set the entry with the lowest such seq
 * is returned, otherwise the one with the highest.  Returns NULL if no
 * matching entry exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
721
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
733
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
744
/*
 * Log the copy of @nr_items pointers from @src (starting at @src_offset)
 * into @dst (starting at @dst_offset): each copied slot is recorded as a
 * removal from @src paired with an addition to @dst.  Returns 0 on success
 * or a negative errno; on failure all entries inserted so far are unwound.
 */
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		struct extent_buffer *src, unsigned long dst_offset,
		unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf-to-leaf copies are not logged */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array holds both halves: adds first, removals second */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unwind: erase any entries already linked into the rb tree */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
815
/*
 * Log the freeing of node @eb by recording a removal entry for every
 * pointer it holds.  Leaves are not logged.  Returns 0 on success or a
 * negative errno.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
861
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
883
/*
 * Update extent reference counts and backref flags when @buf is COWed
 * into @cow.  Sets *last_ref to 1 when the last reference to @buf went
 * away and the old block was cleaned.  Returns 0 on success or a negative
 * errno.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* Shared block with zero refs: metadata corruption */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the old block's refs to full backrefs */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		/* Last reference: clean the old block before it is dropped */
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
994
Googler9398cc32022-12-02 17:21:52 +0800995static struct extent_buffer *alloc_tree_block_no_bg_flush(
996 struct btrfs_trans_handle *trans,
997 struct btrfs_root *root,
998 u64 parent_start,
999 const struct btrfs_disk_key *disk_key,
1000 int level,
1001 u64 hint,
1002 u64 empty_size)
1003{
1004 struct btrfs_fs_info *fs_info = root->fs_info;
1005 struct extent_buffer *ret;
1006
1007 /*
1008 * If we are COWing a node/leaf from the extent, chunk, device or free
1009 * space trees, make sure that we do not finish block group creation of
1010 * pending block groups. We do this to avoid a deadlock.
1011 * COWing can result in allocation of a new chunk, and flushing pending
1012 * block groups (btrfs_create_pending_block_groups()) can be triggered
1013 * when finishing allocation of a new chunk. Creation of a pending block
1014 * group modifies the extent, chunk, device and free space trees,
1015 * therefore we could deadlock with ourselves since we are holding a
1016 * lock on an extent buffer that btrfs_create_pending_block_groups() may
1017 * try to COW later.
1018 * For similar reasons, we also need to delay flushing pending block
1019 * groups when splitting a leaf or node, from one of those trees, since
1020 * we are holding a write lock on it and its parent or when inserting a
1021 * new root node for one of those trees.
1022 */
1023 if (root == fs_info->extent_root ||
1024 root == fs_info->chunk_root ||
1025 root == fs_info->dev_root ||
1026 root == fs_info->free_space_root)
1027 trans->can_flush_pending_bgs = false;
1028
1029 ret = btrfs_alloc_tree_block(trans, root, parent_start,
1030 root->root_key.objectid, disk_key, level,
1031 hint, empty_size);
1032 trans->can_flush_pending_bgs = true;
1033
1034 return ret;
1035}
1036
/*
 * does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;	/* set by update_ref_for_cow() when buf loses its last ref */
	int unlock_orig = 0;
	u64 parent_start = 0;

	/* caller wants the result in place of @buf, so we must unlock buf at the end */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* shareable roots may only be COWed inside the running transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block, used as an allocation hint below */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* copy buf's content, then fix up the header for the new location */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* adjust extent tree references for the old and the new block */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root block: swap the root pointer itself */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		/* log the root replacement before publishing the new root */
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* redirect the parent's slot to the new copy */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* return the new buffer, still write-locked and dirty */
	*cow_ret = cow;
	return 0;
}
1165
1166/*
1167 * returns the logical address of the oldest predecessor of the given root.
1168 * entries older than time_seq are ignored.
1169 */
Googler9398cc32022-12-02 17:21:52 +08001170static struct tree_mod_elem *__tree_mod_log_oldest_root(
1171 struct extent_buffer *eb_root, u64 time_seq)
Googleraf606d22022-10-26 21:40:12 -07001172{
1173 struct tree_mod_elem *tm;
1174 struct tree_mod_elem *found = NULL;
1175 u64 root_logical = eb_root->start;
1176 int looped = 0;
1177
1178 if (!time_seq)
1179 return NULL;
1180
1181 /*
1182 * the very last operation that's logged for a root is the
1183 * replacement operation (if it is replaced at all). this has
1184 * the logical address of the *new* root, making it the very
1185 * first operation that's logged for this root.
1186 */
1187 while (1) {
Googler9398cc32022-12-02 17:21:52 +08001188 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
Googleraf606d22022-10-26 21:40:12 -07001189 time_seq);
1190 if (!looped && !tm)
1191 return NULL;
1192 /*
1193 * if there are no tree operation for the oldest root, we simply
1194 * return it. this should only happen if that (old) root is at
1195 * level 0.
1196 */
1197 if (!tm)
1198 break;
1199
1200 /*
1201 * if there's an operation that's not a root replacement, we
1202 * found the oldest version of our root. normally, we'll find a
1203 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1204 */
1205 if (tm->op != MOD_LOG_ROOT_REPLACE)
1206 break;
1207
1208 found = tm;
1209 root_logical = tm->old_root.logical;
1210 looped = 1;
1211 }
1212
1213 /* if there's no old root to return, return what we found instead */
1214 if (!found)
1215 found = tm;
1216
1217 return found;
1218}
1219
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;				/* running item count of eb while rewinding */
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	/* hold the mod log lock while walking its rbtree of operations */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* undo a removal: restore the logged key pointer */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* undo a replacement: put the old key pointer back */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* undo an addition: drop the last item again */
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* undo a move: shift the key pointers back */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* stop once we leave the log entries for this block */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
1295
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* time_seq == 0 means the current version, nothing to rewind */
	if (!time_seq)
		return eb;

	/* level 0 (leaf) blocks are returned unchanged */
	if (btrfs_header_level(eb) == 0)
		return eb;

	/* no logged operation for this block means nothing to undo */
	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_read(eb);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed: rebuild its old state on a dummy
		 * buffer, seeding the header from the live buffer.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* rewind on a private clone, never on the live buffer */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* hand the caller's reference and lock over to the rewound copy */
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1357
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* nothing logged: the current root is the wanted version */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* the root was replaced since time_seq, rewind its old version */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* the old root block still exists on disk: read and clone it */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* the old root block was freed: rebuild it on a dummy buffer */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* same root block: rewind on a private clone */
		btrfs_set_lock_blocking_read(eb_root);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		/* reconstruct the header of the replaced root */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1441
1442int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1443{
1444 struct tree_mod_elem *tm;
1445 int level;
1446 struct extent_buffer *eb_root = btrfs_root_node(root);
1447
Googler9398cc32022-12-02 17:21:52 +08001448 tm = __tree_mod_log_oldest_root(eb_root, time_seq);
Googleraf606d22022-10-26 21:40:12 -07001449 if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1450 level = tm->old_root.level;
1451 } else {
1452 level = btrfs_header_level(eb_root);
1453 }
1454 free_extent_buffer(eb_root);
1455
1456 return level;
1457}
1458
1459static inline int should_cow_block(struct btrfs_trans_handle *trans,
1460 struct btrfs_root *root,
1461 struct extent_buffer *buf)
1462{
1463 if (btrfs_is_testing(root->fs_info))
1464 return 0;
1465
Googler9398cc32022-12-02 17:21:52 +08001466 /* Ensure we can see the FORCE_COW bit */
1467 smp_mb__before_atomic();
Googleraf606d22022-10-26 21:40:12 -07001468
1469 /*
1470 * We do not need to cow a block if
1471 * 1) this block is not created or changed in this transaction;
1472 * 2) this block does not belong to TREE_RELOC tree;
1473 * 3) the root is not forced COW.
1474 *
1475 * What is forced COW:
1476 * when we create snapshot during committing the transaction,
Googler9398cc32022-12-02 17:21:52 +08001477 * after we've finished copying src root, we must COW the shared
Googleraf606d22022-10-26 21:40:12 -07001478 * block to ensure the metadata consistency.
1479 */
1480 if (btrfs_header_generation(buf) == trans->transid &&
1481 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1482 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1483 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1484 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1485 return 0;
1486 return 1;
1487}
1488
1489/*
1490 * cows a single block, see __btrfs_cow_block for the real work.
1491 * This version of it has extra checks so that a block isn't COWed more than
1492 * once per transaction, as long as it hasn't been written yet
1493 */
1494noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1495 struct btrfs_root *root, struct extent_buffer *buf,
1496 struct extent_buffer *parent, int parent_slot,
1497 struct extent_buffer **cow_ret)
1498{
Googler9398cc32022-12-02 17:21:52 +08001499 struct btrfs_fs_info *fs_info = root->fs_info;
Googleraf606d22022-10-26 21:40:12 -07001500 u64 search_start;
1501 int ret;
1502
Googler9398cc32022-12-02 17:21:52 +08001503 if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1504 btrfs_err(fs_info,
1505 "COW'ing blocks on a fs root that's being dropped");
1506
1507 if (trans->transaction != fs_info->running_transaction)
Googleraf606d22022-10-26 21:40:12 -07001508 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1509 trans->transid,
Googler9398cc32022-12-02 17:21:52 +08001510 fs_info->running_transaction->transid);
Googleraf606d22022-10-26 21:40:12 -07001511
Googler9398cc32022-12-02 17:21:52 +08001512 if (trans->transid != fs_info->generation)
Googleraf606d22022-10-26 21:40:12 -07001513 WARN(1, KERN_CRIT "trans %llu running %llu\n",
Googler9398cc32022-12-02 17:21:52 +08001514 trans->transid, fs_info->generation);
Googleraf606d22022-10-26 21:40:12 -07001515
1516 if (!should_cow_block(trans, root, buf)) {
1517 trans->dirty = true;
1518 *cow_ret = buf;
1519 return 0;
1520 }
1521
1522 search_start = buf->start & ~((u64)SZ_1G - 1);
1523
1524 if (parent)
Googler9398cc32022-12-02 17:21:52 +08001525 btrfs_set_lock_blocking_write(parent);
1526 btrfs_set_lock_blocking_write(buf);
Googleraf606d22022-10-26 21:40:12 -07001527
Googler9398cc32022-12-02 17:21:52 +08001528 /*
1529 * Before CoWing this block for later modification, check if it's
1530 * the subtree root and do the delayed subtree trace if needed.
1531 *
1532 * Also We don't care about the error, as it's handled internally.
1533 */
1534 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
Googleraf606d22022-10-26 21:40:12 -07001535 ret = __btrfs_cow_block(trans, root, buf, parent,
1536 parent_slot, cow_ret, search_start, 0);
1537
1538 trace_btrfs_cow_block(root, buf, *cow_ret);
1539
1540 return ret;
1541}
1542
1543/*
1544 * helper function for defrag to decide if two blocks pointed to by a
1545 * node are actually close by
1546 */
1547static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1548{
1549 if (blocknr < other && other - (blocknr + blocksize) < 32768)
1550 return 1;
1551 if (blocknr > other && blocknr - (other + blocksize) < 32768)
1552 return 1;
1553 return 0;
1554}
1555
1556/*
1557 * compare two keys in a memcmp fashion
1558 */
Googler9398cc32022-12-02 17:21:52 +08001559static int comp_keys(const struct btrfs_disk_key *disk,
1560 const struct btrfs_key *k2)
Googleraf606d22022-10-26 21:40:12 -07001561{
1562 struct btrfs_key k1;
1563
1564 btrfs_disk_key_to_cpu(&k1, disk);
1565
1566 return btrfs_comp_cpu_keys(&k1, k2);
1567}
1568
1569/*
1570 * same as comp_keys only with two btrfs_key's
1571 */
Googler9398cc32022-12-02 17:21:52 +08001572int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
Googleraf606d22022-10-26 21:40:12 -07001573{
1574 if (k1->objectid > k2->objectid)
1575 return 1;
1576 if (k1->objectid < k2->objectid)
1577 return -1;
1578 if (k1->type > k2->type)
1579 return 1;
1580 if (k1->type < k2->type)
1581 return -1;
1582 if (k1->offset > k2->offset)
1583 return 1;
1584 if (k1->offset < k2->offset)
1585 return -1;
1586 return 0;
1587}
1588
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;	/* allocation hint carried between calls */
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* with fewer than two children there is nothing to reorder */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking_write(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		/* skip slots whose keys precede the caller's progress key */
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/* leave blocks alone that are already close to a neighbor */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* get the child buffer, from the page cache if possible */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
							parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/* COW the child near search_start to improve on-disk order */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking_write(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		/* remember where we ended up so the next pass continues there */
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1705
/*
 * search for key in the extent_buffer. The items start at offset p,
 * and they are item_size apart. There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * Returns 0 if the key was found (*slot is its index), 1 if not found
 * (*slot is the insertion point), or a negative errno.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;	/* range of eb currently mapped at kaddr */
	unsigned long map_len = 0;
	int err;

	/* low starts at 0, so this only triggers when max is negative */
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap when the key at @offset falls outside the mapped range */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				/*
				 * NOTE(review): err == 1 appears to mean the
				 * key cannot be mapped contiguously - fall
				 * back to copying it out; confirm against
				 * map_private_extent_buffer().
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	/* not found: low is where the key would be inserted */
	*slot = low;
	return 1;
}
1782
1783/*
1784 * simple bin_search frontend that does the right thing for
1785 * leaves vs nodes
1786 */
Googler9398cc32022-12-02 17:21:52 +08001787int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1788 int level, int *slot)
Googleraf606d22022-10-26 21:40:12 -07001789{
1790 if (level == 0)
1791 return generic_bin_search(eb,
1792 offsetof(struct btrfs_leaf, items),
1793 sizeof(struct btrfs_item),
1794 key, btrfs_header_nritems(eb),
1795 slot);
1796 else
1797 return generic_bin_search(eb,
1798 offsetof(struct btrfs_node, ptrs),
1799 sizeof(struct btrfs_key_ptr),
1800 key, btrfs_header_nritems(eb),
1801 slot);
1802}
1803
1804static void root_add_used(struct btrfs_root *root, u32 size)
1805{
1806 spin_lock(&root->accounting_lock);
1807 btrfs_set_root_used(&root->root_item,
1808 btrfs_root_used(&root->root_item) + size);
1809 spin_unlock(&root->accounting_lock);
1810}
1811
1812static void root_sub_used(struct btrfs_root *root, u32 size)
1813{
1814 spin_lock(&root->accounting_lock);
1815 btrfs_set_root_used(&root->root_item,
1816 btrfs_root_used(&root->root_item) - size);
1817 spin_unlock(&root->accounting_lock);
1818}
1819
1820/* given a node and slot number, this reads the blocks it points to. The
1821 * extent buffer is returned with a reference taken (but unlocked).
1822 */
Googler9398cc32022-12-02 17:21:52 +08001823struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1824 int slot)
Googleraf606d22022-10-26 21:40:12 -07001825{
1826 int level = btrfs_header_level(parent);
1827 struct extent_buffer *eb;
Googler9398cc32022-12-02 17:21:52 +08001828 struct btrfs_key first_key;
Googleraf606d22022-10-26 21:40:12 -07001829
1830 if (slot < 0 || slot >= btrfs_header_nritems(parent))
1831 return ERR_PTR(-ENOENT);
1832
1833 BUG_ON(level == 0);
1834
Googler9398cc32022-12-02 17:21:52 +08001835 btrfs_node_key_to_cpu(parent, &first_key, slot);
1836 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1837 btrfs_node_ptr_generation(parent, slot),
1838 level - 1, &first_key);
Googleraf606d22022-10-26 21:40:12 -07001839 if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1840 free_extent_buffer(eb);
1841 eb = ERR_PTR(-EIO);
1842 }
1843
1844 return eb;
1845}
1846
1847/*
1848 * node level balancing, used to make sure nodes are in proper order for
1849 * item deletion. We balance from the top down, so we have to make sure
1850 * that a deletion won't leave an node completely empty later on.
1851 */
1852static noinline int balance_level(struct btrfs_trans_handle *trans,
1853 struct btrfs_root *root,
1854 struct btrfs_path *path, int level)
1855{
Googler9398cc32022-12-02 17:21:52 +08001856 struct btrfs_fs_info *fs_info = root->fs_info;
Googleraf606d22022-10-26 21:40:12 -07001857 struct extent_buffer *right = NULL;
1858 struct extent_buffer *mid;
1859 struct extent_buffer *left = NULL;
1860 struct extent_buffer *parent = NULL;
1861 int ret = 0;
1862 int wret;
1863 int pslot;
1864 int orig_slot = path->slots[level];
1865 u64 orig_ptr;
1866
Googler9398cc32022-12-02 17:21:52 +08001867 ASSERT(level > 0);
Googleraf606d22022-10-26 21:40:12 -07001868
1869 mid = path->nodes[level];
1870
1871 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1872 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1873 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1874
1875 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1876
1877 if (level < BTRFS_MAX_LEVEL - 1) {
1878 parent = path->nodes[level + 1];
1879 pslot = path->slots[level + 1];
1880 }
1881
1882 /*
1883 * deal with the case where there is only one pointer in the root
1884 * by promoting the node below to a root
1885 */
1886 if (!parent) {
1887 struct extent_buffer *child;
1888
1889 if (btrfs_header_nritems(mid) != 1)
1890 return 0;
1891
1892 /* promote the child to a root */
Googler9398cc32022-12-02 17:21:52 +08001893 child = btrfs_read_node_slot(mid, 0);
Googleraf606d22022-10-26 21:40:12 -07001894 if (IS_ERR(child)) {
1895 ret = PTR_ERR(child);
Googler9398cc32022-12-02 17:21:52 +08001896 btrfs_handle_fs_error(fs_info, ret, NULL);
Googleraf606d22022-10-26 21:40:12 -07001897 goto enospc;
1898 }
1899
1900 btrfs_tree_lock(child);
Googler9398cc32022-12-02 17:21:52 +08001901 btrfs_set_lock_blocking_write(child);
Googleraf606d22022-10-26 21:40:12 -07001902 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1903 if (ret) {
1904 btrfs_tree_unlock(child);
1905 free_extent_buffer(child);
1906 goto enospc;
1907 }
1908
Googler9398cc32022-12-02 17:21:52 +08001909 ret = tree_mod_log_insert_root(root->node, child, 1);
1910 BUG_ON(ret < 0);
Googleraf606d22022-10-26 21:40:12 -07001911 rcu_assign_pointer(root->node, child);
1912
1913 add_root_to_dirty_list(root);
1914 btrfs_tree_unlock(child);
1915
1916 path->locks[level] = 0;
1917 path->nodes[level] = NULL;
Googler9398cc32022-12-02 17:21:52 +08001918 btrfs_clean_tree_block(mid);
Googleraf606d22022-10-26 21:40:12 -07001919 btrfs_tree_unlock(mid);
1920 /* once for the path */
1921 free_extent_buffer(mid);
1922
1923 root_sub_used(root, mid->len);
1924 btrfs_free_tree_block(trans, root, mid, 0, 1);
1925 /* once for the root ptr */
1926 free_extent_buffer_stale(mid);
1927 return 0;
1928 }
1929 if (btrfs_header_nritems(mid) >
Googler9398cc32022-12-02 17:21:52 +08001930 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
Googleraf606d22022-10-26 21:40:12 -07001931 return 0;
1932
Googler9398cc32022-12-02 17:21:52 +08001933 left = btrfs_read_node_slot(parent, pslot - 1);
Googleraf606d22022-10-26 21:40:12 -07001934 if (IS_ERR(left))
1935 left = NULL;
1936
1937 if (left) {
1938 btrfs_tree_lock(left);
Googler9398cc32022-12-02 17:21:52 +08001939 btrfs_set_lock_blocking_write(left);
Googleraf606d22022-10-26 21:40:12 -07001940 wret = btrfs_cow_block(trans, root, left,
1941 parent, pslot - 1, &left);
1942 if (wret) {
1943 ret = wret;
1944 goto enospc;
1945 }
1946 }
1947
Googler9398cc32022-12-02 17:21:52 +08001948 right = btrfs_read_node_slot(parent, pslot + 1);
Googleraf606d22022-10-26 21:40:12 -07001949 if (IS_ERR(right))
1950 right = NULL;
1951
1952 if (right) {
1953 btrfs_tree_lock(right);
Googler9398cc32022-12-02 17:21:52 +08001954 btrfs_set_lock_blocking_write(right);
Googleraf606d22022-10-26 21:40:12 -07001955 wret = btrfs_cow_block(trans, root, right,
1956 parent, pslot + 1, &right);
1957 if (wret) {
1958 ret = wret;
1959 goto enospc;
1960 }
1961 }
1962
1963 /* first, try to make some room in the middle buffer */
1964 if (left) {
1965 orig_slot += btrfs_header_nritems(left);
Googler9398cc32022-12-02 17:21:52 +08001966 wret = push_node_left(trans, left, mid, 1);
Googleraf606d22022-10-26 21:40:12 -07001967 if (wret < 0)
1968 ret = wret;
1969 }
1970
1971 /*
1972 * then try to empty the right most buffer into the middle
1973 */
1974 if (right) {
Googler9398cc32022-12-02 17:21:52 +08001975 wret = push_node_left(trans, mid, right, 1);
Googleraf606d22022-10-26 21:40:12 -07001976 if (wret < 0 && wret != -ENOSPC)
1977 ret = wret;
1978 if (btrfs_header_nritems(right) == 0) {
Googler9398cc32022-12-02 17:21:52 +08001979 btrfs_clean_tree_block(right);
Googleraf606d22022-10-26 21:40:12 -07001980 btrfs_tree_unlock(right);
1981 del_ptr(root, path, level + 1, pslot + 1);
1982 root_sub_used(root, right->len);
1983 btrfs_free_tree_block(trans, root, right, 0, 1);
1984 free_extent_buffer_stale(right);
1985 right = NULL;
1986 } else {
1987 struct btrfs_disk_key right_key;
1988 btrfs_node_key(right, &right_key, 0);
Googler9398cc32022-12-02 17:21:52 +08001989 ret = tree_mod_log_insert_key(parent, pslot + 1,
1990 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1991 BUG_ON(ret < 0);
Googleraf606d22022-10-26 21:40:12 -07001992 btrfs_set_node_key(parent, &right_key, pslot + 1);
1993 btrfs_mark_buffer_dirty(parent);
1994 }
1995 }
1996 if (btrfs_header_nritems(mid) == 1) {
1997 /*
1998 * we're not allowed to leave a node with one item in the
1999 * tree during a delete. A deletion from lower in the tree
2000 * could try to delete the only pointer in this node.
2001 * So, pull some keys from the left.
2002 * There has to be a left pointer at this point because
2003 * otherwise we would have pulled some pointers from the
2004 * right
2005 */
2006 if (!left) {
2007 ret = -EROFS;
Googler9398cc32022-12-02 17:21:52 +08002008 btrfs_handle_fs_error(fs_info, ret, NULL);
Googleraf606d22022-10-26 21:40:12 -07002009 goto enospc;
2010 }
Googler9398cc32022-12-02 17:21:52 +08002011 wret = balance_node_right(trans, mid, left);
Googleraf606d22022-10-26 21:40:12 -07002012 if (wret < 0) {
2013 ret = wret;
2014 goto enospc;
2015 }
2016 if (wret == 1) {
Googler9398cc32022-12-02 17:21:52 +08002017 wret = push_node_left(trans, left, mid, 1);
Googleraf606d22022-10-26 21:40:12 -07002018 if (wret < 0)
2019 ret = wret;
2020 }
2021 BUG_ON(wret == 1);
2022 }
2023 if (btrfs_header_nritems(mid) == 0) {
Googler9398cc32022-12-02 17:21:52 +08002024 btrfs_clean_tree_block(mid);
Googleraf606d22022-10-26 21:40:12 -07002025 btrfs_tree_unlock(mid);
2026 del_ptr(root, path, level + 1, pslot);
2027 root_sub_used(root, mid->len);
2028 btrfs_free_tree_block(trans, root, mid, 0, 1);
2029 free_extent_buffer_stale(mid);
2030 mid = NULL;
2031 } else {
2032 /* update the parent key to reflect our changes */
2033 struct btrfs_disk_key mid_key;
2034 btrfs_node_key(mid, &mid_key, 0);
Googler9398cc32022-12-02 17:21:52 +08002035 ret = tree_mod_log_insert_key(parent, pslot,
2036 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2037 BUG_ON(ret < 0);
Googleraf606d22022-10-26 21:40:12 -07002038 btrfs_set_node_key(parent, &mid_key, pslot);
2039 btrfs_mark_buffer_dirty(parent);
2040 }
2041
2042 /* update the path */
2043 if (left) {
2044 if (btrfs_header_nritems(left) > orig_slot) {
2045 extent_buffer_get(left);
2046 /* left was locked after cow */
2047 path->nodes[level] = left;
2048 path->slots[level + 1] -= 1;
2049 path->slots[level] = orig_slot;
2050 if (mid) {
2051 btrfs_tree_unlock(mid);
2052 free_extent_buffer(mid);
2053 }
2054 } else {
2055 orig_slot -= btrfs_header_nritems(left);
2056 path->slots[level] = orig_slot;
2057 }
2058 }
2059 /* double check we haven't messed things up */
2060 if (orig_ptr !=
2061 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2062 BUG();
2063enospc:
2064 if (right) {
2065 btrfs_tree_unlock(right);
2066 free_extent_buffer(right);
2067 }
2068 if (left) {
2069 if (path->nodes[level] != left)
2070 btrfs_tree_unlock(left);
2071 free_extent_buffer(left);
2072 }
2073 return ret;
2074}
2075
2076/* Node balancing for insertion. Here we only split or push nodes around
2077 * when they are completely full. This is also done top down, so we
2078 * have to be pessimistic.
2079 */
2080static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2081 struct btrfs_root *root,
2082 struct btrfs_path *path, int level)
2083{
Googler9398cc32022-12-02 17:21:52 +08002084 struct btrfs_fs_info *fs_info = root->fs_info;
Googleraf606d22022-10-26 21:40:12 -07002085 struct extent_buffer *right = NULL;
2086 struct extent_buffer *mid;
2087 struct extent_buffer *left = NULL;
2088 struct extent_buffer *parent = NULL;
2089 int ret = 0;
2090 int wret;
2091 int pslot;
2092 int orig_slot = path->slots[level];
2093
2094 if (level == 0)
2095 return 1;
2096
2097 mid = path->nodes[level];
2098 WARN_ON(btrfs_header_generation(mid) != trans->transid);
2099
2100 if (level < BTRFS_MAX_LEVEL - 1) {
2101 parent = path->nodes[level + 1];
2102 pslot = path->slots[level + 1];
2103 }
2104
2105 if (!parent)
2106 return 1;
2107
Googler9398cc32022-12-02 17:21:52 +08002108 left = btrfs_read_node_slot(parent, pslot - 1);
Googleraf606d22022-10-26 21:40:12 -07002109 if (IS_ERR(left))
2110 left = NULL;
2111
2112 /* first, try to make some room in the middle buffer */
2113 if (left) {
2114 u32 left_nr;
2115
2116 btrfs_tree_lock(left);
Googler9398cc32022-12-02 17:21:52 +08002117 btrfs_set_lock_blocking_write(left);
Googleraf606d22022-10-26 21:40:12 -07002118
2119 left_nr = btrfs_header_nritems(left);
Googler9398cc32022-12-02 17:21:52 +08002120 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
Googleraf606d22022-10-26 21:40:12 -07002121 wret = 1;
2122 } else {
2123 ret = btrfs_cow_block(trans, root, left, parent,
2124 pslot - 1, &left);
2125 if (ret)
2126 wret = 1;
2127 else {
Googler9398cc32022-12-02 17:21:52 +08002128 wret = push_node_left(trans, left, mid, 0);
Googleraf606d22022-10-26 21:40:12 -07002129 }
2130 }
2131 if (wret < 0)
2132 ret = wret;
2133 if (wret == 0) {
2134 struct btrfs_disk_key disk_key;
2135 orig_slot += left_nr;
2136 btrfs_node_key(mid, &disk_key, 0);
Googler9398cc32022-12-02 17:21:52 +08002137 ret = tree_mod_log_insert_key(parent, pslot,
2138 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2139 BUG_ON(ret < 0);
Googleraf606d22022-10-26 21:40:12 -07002140 btrfs_set_node_key(parent, &disk_key, pslot);
2141 btrfs_mark_buffer_dirty(parent);
2142 if (btrfs_header_nritems(left) > orig_slot) {
2143 path->nodes[level] = left;
2144 path->slots[level + 1] -= 1;
2145 path->slots[level] = orig_slot;
2146 btrfs_tree_unlock(mid);
2147 free_extent_buffer(mid);
2148 } else {
2149 orig_slot -=
2150 btrfs_header_nritems(left);
2151 path->slots[level] = orig_slot;
2152 btrfs_tree_unlock(left);
2153 free_extent_buffer(left);
2154 }
2155 return 0;
2156 }
2157 btrfs_tree_unlock(left);
2158 free_extent_buffer(left);
2159 }
Googler9398cc32022-12-02 17:21:52 +08002160 right = btrfs_read_node_slot(parent, pslot + 1);
Googleraf606d22022-10-26 21:40:12 -07002161 if (IS_ERR(right))
2162 right = NULL;
2163
2164 /*
2165 * then try to empty the right most buffer into the middle
2166 */
2167 if (right) {
2168 u32 right_nr;
2169
2170 btrfs_tree_lock(right);
Googler9398cc32022-12-02 17:21:52 +08002171 btrfs_set_lock_blocking_write(right);
Googleraf606d22022-10-26 21:40:12 -07002172
2173 right_nr = btrfs_header_nritems(right);
Googler9398cc32022-12-02 17:21:52 +08002174 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
Googleraf606d22022-10-26 21:40:12 -07002175 wret = 1;
2176 } else {
2177 ret = btrfs_cow_block(trans, root, right,
2178 parent, pslot + 1,
2179 &right);
2180 if (ret)
2181 wret = 1;
2182 else {
Googler9398cc32022-12-02 17:21:52 +08002183 wret = balance_node_right(trans, right, mid);
Googleraf606d22022-10-26 21:40:12 -07002184 }
2185 }
2186 if (wret < 0)
2187 ret = wret;
2188 if (wret == 0) {
2189 struct btrfs_disk_key disk_key;
2190
2191 btrfs_node_key(right, &disk_key, 0);
Googler9398cc32022-12-02 17:21:52 +08002192 ret = tree_mod_log_insert_key(parent, pslot + 1,
2193 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2194 BUG_ON(ret < 0);
Googleraf606d22022-10-26 21:40:12 -07002195 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2196 btrfs_mark_buffer_dirty(parent);
2197
2198 if (btrfs_header_nritems(mid) <= orig_slot) {
2199 path->nodes[level] = right;
2200 path->slots[level + 1] += 1;
2201 path->slots[level] = orig_slot -
2202 btrfs_header_nritems(mid);
2203 btrfs_tree_unlock(mid);
2204 free_extent_buffer(mid);
2205 } else {
2206 btrfs_tree_unlock(right);
2207 free_extent_buffer(right);
2208 }
2209 return 0;
2210 }
2211 btrfs_tree_unlock(right);
2212 free_extent_buffer(right);
2213 }
2214 return 1;
2215}
2216
2217/*
2218 * readahead one full node of leaves, finding things that are close
2219 * to the block in 'slot', and triggering ra on them.
2220 */
Googler9398cc32022-12-02 17:21:52 +08002221static void reada_for_search(struct btrfs_fs_info *fs_info,
Googleraf606d22022-10-26 21:40:12 -07002222 struct btrfs_path *path,
2223 int level, int slot, u64 objectid)
2224{
2225 struct extent_buffer *node;
2226 struct btrfs_disk_key disk_key;
2227 u32 nritems;
2228 u64 search;
2229 u64 target;
2230 u64 nread = 0;
2231 struct extent_buffer *eb;
2232 u32 nr;
2233 u32 blocksize;
2234 u32 nscan = 0;
2235
2236 if (level != 1)
2237 return;
2238
2239 if (!path->nodes[level])
2240 return;
2241
2242 node = path->nodes[level];
2243
2244 search = btrfs_node_blockptr(node, slot);
Googler9398cc32022-12-02 17:21:52 +08002245 blocksize = fs_info->nodesize;
2246 eb = find_extent_buffer(fs_info, search);
Googleraf606d22022-10-26 21:40:12 -07002247 if (eb) {
2248 free_extent_buffer(eb);
2249 return;
2250 }
2251
2252 target = search;
2253
2254 nritems = btrfs_header_nritems(node);
2255 nr = slot;
2256
2257 while (1) {
2258 if (path->reada == READA_BACK) {
2259 if (nr == 0)
2260 break;
2261 nr--;
2262 } else if (path->reada == READA_FORWARD) {
2263 nr++;
2264 if (nr >= nritems)
2265 break;
2266 }
2267 if (path->reada == READA_BACK && objectid) {
2268 btrfs_node_key(node, &disk_key, nr);
2269 if (btrfs_disk_key_objectid(&disk_key) != objectid)
2270 break;
2271 }
2272 search = btrfs_node_blockptr(node, nr);
2273 if ((search <= target && target - search <= 65536) ||
2274 (search > target && search - target <= 65536)) {
Googler9398cc32022-12-02 17:21:52 +08002275 readahead_tree_block(fs_info, search);
Googleraf606d22022-10-26 21:40:12 -07002276 nread += blocksize;
2277 }
2278 nscan++;
2279 if ((nread > 65536 || nscan > 32))
2280 break;
2281 }
2282}
2283
Googler9398cc32022-12-02 17:21:52 +08002284static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
Googleraf606d22022-10-26 21:40:12 -07002285 struct btrfs_path *path, int level)
2286{
2287 int slot;
2288 int nritems;
2289 struct extent_buffer *parent;
2290 struct extent_buffer *eb;
2291 u64 gen;
2292 u64 block1 = 0;
2293 u64 block2 = 0;
2294
2295 parent = path->nodes[level + 1];
2296 if (!parent)
2297 return;
2298
2299 nritems = btrfs_header_nritems(parent);
2300 slot = path->slots[level + 1];
2301
2302 if (slot > 0) {
2303 block1 = btrfs_node_blockptr(parent, slot - 1);
2304 gen = btrfs_node_ptr_generation(parent, slot - 1);
Googler9398cc32022-12-02 17:21:52 +08002305 eb = find_extent_buffer(fs_info, block1);
Googleraf606d22022-10-26 21:40:12 -07002306 /*
2307 * if we get -eagain from btrfs_buffer_uptodate, we
2308 * don't want to return eagain here. That will loop
2309 * forever
2310 */
2311 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2312 block1 = 0;
2313 free_extent_buffer(eb);
2314 }
2315 if (slot + 1 < nritems) {
2316 block2 = btrfs_node_blockptr(parent, slot + 1);
2317 gen = btrfs_node_ptr_generation(parent, slot + 1);
Googler9398cc32022-12-02 17:21:52 +08002318 eb = find_extent_buffer(fs_info, block2);
Googleraf606d22022-10-26 21:40:12 -07002319 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2320 block2 = 0;
2321 free_extent_buffer(eb);
2322 }
2323
2324 if (block1)
Googler9398cc32022-12-02 17:21:52 +08002325 readahead_tree_block(fs_info, block1);
Googleraf606d22022-10-26 21:40:12 -07002326 if (block2)
Googler9398cc32022-12-02 17:21:52 +08002327 readahead_tree_block(fs_info, block2);
Googleraf606d22022-10-26 21:40:12 -07002328}
2329
2330
2331/*
2332 * when we walk down the tree, it is usually safe to unlock the higher layers
2333 * in the tree. The exceptions are when our path goes through slot 0, because
2334 * operations on the tree might require changing key pointers higher up in the
2335 * tree.
2336 *
2337 * callers might also have set path->keep_locks, which tells this code to keep
2338 * the lock if the path points to the last slot in the block. This is part of
2339 * walking through the tree, and selecting the next slot in the higher block.
2340 *
2341 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
2342 * if lowest_unlock is 1, level 0 won't be unlocked
2343 */
2344static noinline void unlock_up(struct btrfs_path *path, int level,
2345 int lowest_unlock, int min_write_lock_level,
2346 int *write_lock_level)
2347{
2348 int i;
2349 int skip_level = level;
2350 int no_skips = 0;
2351 struct extent_buffer *t;
2352
2353 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2354 if (!path->nodes[i])
2355 break;
2356 if (!path->locks[i])
2357 break;
2358 if (!no_skips && path->slots[i] == 0) {
2359 skip_level = i + 1;
2360 continue;
2361 }
2362 if (!no_skips && path->keep_locks) {
2363 u32 nritems;
2364 t = path->nodes[i];
2365 nritems = btrfs_header_nritems(t);
2366 if (nritems < 1 || path->slots[i] >= nritems - 1) {
2367 skip_level = i + 1;
2368 continue;
2369 }
2370 }
2371 if (skip_level < i && i >= lowest_unlock)
2372 no_skips = 1;
2373
2374 t = path->nodes[i];
Googler9398cc32022-12-02 17:21:52 +08002375 if (i >= lowest_unlock && i > skip_level) {
Googleraf606d22022-10-26 21:40:12 -07002376 btrfs_tree_unlock_rw(t, path->locks[i]);
2377 path->locks[i] = 0;
2378 if (write_lock_level &&
2379 i > min_write_lock_level &&
2380 i <= *write_lock_level) {
2381 *write_lock_level = i - 1;
2382 }
2383 }
2384 }
2385}
2386
2387/*
2388 * This releases any locks held in the path starting at level and
2389 * going all the way up to the root.
2390 *
2391 * btrfs_search_slot will keep the lock held on higher nodes in a few
2392 * corner cases, such as COW of the block at slot zero in the node. This
2393 * ignores those rules, and it should only be called when there are no
2394 * more updates to be done higher up in the tree.
2395 */
2396noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2397{
2398 int i;
2399
2400 if (path->keep_locks)
2401 return;
2402
2403 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2404 if (!path->nodes[i])
2405 continue;
2406 if (!path->locks[i])
2407 continue;
2408 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2409 path->locks[i] = 0;
2410 }
2411}
2412
2413/*
2414 * helper function for btrfs_search_slot. The goal is to find a block
2415 * in cache without setting the path to blocking. If we find the block
2416 * we return zero and the path is unchanged.
2417 *
2418 * If we can't find the block, we set the path blocking and do some
2419 * reada. -EAGAIN is returned and the search must be repeated.
2420 */
2421static int
Googler9398cc32022-12-02 17:21:52 +08002422read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2423 struct extent_buffer **eb_ret, int level, int slot,
2424 const struct btrfs_key *key)
Googleraf606d22022-10-26 21:40:12 -07002425{
Googler9398cc32022-12-02 17:21:52 +08002426 struct btrfs_fs_info *fs_info = root->fs_info;
Googleraf606d22022-10-26 21:40:12 -07002427 u64 blocknr;
2428 u64 gen;
2429 struct extent_buffer *b = *eb_ret;
2430 struct extent_buffer *tmp;
Googler9398cc32022-12-02 17:21:52 +08002431 struct btrfs_key first_key;
Googleraf606d22022-10-26 21:40:12 -07002432 int ret;
Googler9398cc32022-12-02 17:21:52 +08002433 int parent_level;
Googleraf606d22022-10-26 21:40:12 -07002434
2435 blocknr = btrfs_node_blockptr(b, slot);
2436 gen = btrfs_node_ptr_generation(b, slot);
Googler9398cc32022-12-02 17:21:52 +08002437 parent_level = btrfs_header_level(b);
2438 btrfs_node_key_to_cpu(b, &first_key, slot);
Googleraf606d22022-10-26 21:40:12 -07002439
Googler9398cc32022-12-02 17:21:52 +08002440 tmp = find_extent_buffer(fs_info, blocknr);
Googleraf606d22022-10-26 21:40:12 -07002441 if (tmp) {
2442 /* first we do an atomic uptodate check */
2443 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
Googler9398cc32022-12-02 17:21:52 +08002444 /*
2445 * Do extra check for first_key, eb can be stale due to
2446 * being cached, read from scrub, or have multiple
2447 * parents (shared tree blocks).
2448 */
2449 if (btrfs_verify_level_key(tmp,
2450 parent_level - 1, &first_key, gen)) {
2451 free_extent_buffer(tmp);
2452 return -EUCLEAN;
2453 }
Googleraf606d22022-10-26 21:40:12 -07002454 *eb_ret = tmp;
2455 return 0;
2456 }
2457
2458 /* the pages were up to date, but we failed
2459 * the generation number check. Do a full
2460 * read for the generation number that is correct.
2461 * We must do this without dropping locks so
2462 * we can trust our generation number
2463 */
2464 btrfs_set_path_blocking(p);
2465
2466 /* now we're allowed to do a blocking uptodate check */
Googler9398cc32022-12-02 17:21:52 +08002467 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
Googleraf606d22022-10-26 21:40:12 -07002468 if (!ret) {
2469 *eb_ret = tmp;
2470 return 0;
2471 }
2472 free_extent_buffer(tmp);
2473 btrfs_release_path(p);
2474 return -EIO;
2475 }
2476
2477 /*
2478 * reduce lock contention at high levels
2479 * of the btree by dropping locks before
2480 * we read. Don't release the lock on the current
2481 * level because we need to walk this node to figure
2482 * out which blocks to read.
2483 */
2484 btrfs_unlock_up_safe(p, level + 1);
2485 btrfs_set_path_blocking(p);
2486
2487 if (p->reada != READA_NONE)
Googler9398cc32022-12-02 17:21:52 +08002488 reada_for_search(fs_info, p, level, slot, key->objectid);
Googleraf606d22022-10-26 21:40:12 -07002489
2490 ret = -EAGAIN;
Googler9398cc32022-12-02 17:21:52 +08002491 tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2492 &first_key);
Googleraf606d22022-10-26 21:40:12 -07002493 if (!IS_ERR(tmp)) {
2494 /*
2495 * If the read above didn't mark this buffer up to date,
2496 * it will never end up being up to date. Set ret to EIO now
2497 * and give up so that our caller doesn't loop forever
2498 * on our EAGAINs.
2499 */
Googler9398cc32022-12-02 17:21:52 +08002500 if (!extent_buffer_uptodate(tmp))
Googleraf606d22022-10-26 21:40:12 -07002501 ret = -EIO;
2502 free_extent_buffer(tmp);
2503 } else {
2504 ret = PTR_ERR(tmp);
2505 }
2506
2507 btrfs_release_path(p);
2508 return ret;
2509}
2510
2511/*
2512 * helper function for btrfs_search_slot. This does all of the checks
2513 * for node-level blocks and does any balancing required based on
2514 * the ins_len.
2515 *
2516 * If no extra work was required, zero is returned. If we had to
2517 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2518 * start over
2519 */
2520static int
2521setup_nodes_for_search(struct btrfs_trans_handle *trans,
2522 struct btrfs_root *root, struct btrfs_path *p,
2523 struct extent_buffer *b, int level, int ins_len,
2524 int *write_lock_level)
2525{
Googler9398cc32022-12-02 17:21:52 +08002526 struct btrfs_fs_info *fs_info = root->fs_info;
Googleraf606d22022-10-26 21:40:12 -07002527 int ret;
Googler9398cc32022-12-02 17:21:52 +08002528
Googleraf606d22022-10-26 21:40:12 -07002529 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
Googler9398cc32022-12-02 17:21:52 +08002530 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
Googleraf606d22022-10-26 21:40:12 -07002531 int sret;
2532
2533 if (*write_lock_level < level + 1) {
2534 *write_lock_level = level + 1;
2535 btrfs_release_path(p);
2536 goto again;
2537 }
2538
2539 btrfs_set_path_blocking(p);
Googler9398cc32022-12-02 17:21:52 +08002540 reada_for_balance(fs_info, p, level);
Googleraf606d22022-10-26 21:40:12 -07002541 sret = split_node(trans, root, p, level);
Googleraf606d22022-10-26 21:40:12 -07002542
2543 BUG_ON(sret > 0);
2544 if (sret) {
2545 ret = sret;
2546 goto done;
2547 }
2548 b = p->nodes[level];
2549 } else if (ins_len < 0 && btrfs_header_nritems(b) <
Googler9398cc32022-12-02 17:21:52 +08002550 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
Googleraf606d22022-10-26 21:40:12 -07002551 int sret;
2552
2553 if (*write_lock_level < level + 1) {
2554 *write_lock_level = level + 1;
2555 btrfs_release_path(p);
2556 goto again;
2557 }
2558
2559 btrfs_set_path_blocking(p);
Googler9398cc32022-12-02 17:21:52 +08002560 reada_for_balance(fs_info, p, level);
Googleraf606d22022-10-26 21:40:12 -07002561 sret = balance_level(trans, root, p, level);
Googleraf606d22022-10-26 21:40:12 -07002562
2563 if (sret) {
2564 ret = sret;
2565 goto done;
2566 }
2567 b = p->nodes[level];
2568 if (!b) {
2569 btrfs_release_path(p);
2570 goto again;
2571 }
2572 BUG_ON(btrfs_header_nritems(b) == 1);
2573 }
2574 return 0;
2575
2576again:
2577 ret = -EAGAIN;
2578done:
2579 return ret;
2580}
2581
Googler9398cc32022-12-02 17:21:52 +08002582static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
Googleraf606d22022-10-26 21:40:12 -07002583 int level, int *prev_cmp, int *slot)
2584{
2585 if (*prev_cmp != 0) {
Googler9398cc32022-12-02 17:21:52 +08002586 *prev_cmp = btrfs_bin_search(b, key, level, slot);
Googleraf606d22022-10-26 21:40:12 -07002587 return *prev_cmp;
2588 }
2589
Googleraf606d22022-10-26 21:40:12 -07002590 *slot = 0;
2591
2592 return 0;
2593}
2594
2595int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2596 u64 iobjectid, u64 ioff, u8 key_type,
2597 struct btrfs_key *found_key)
2598{
2599 int ret;
2600 struct btrfs_key key;
2601 struct extent_buffer *eb;
2602
2603 ASSERT(path);
2604 ASSERT(found_key);
2605
2606 key.type = key_type;
2607 key.objectid = iobjectid;
2608 key.offset = ioff;
2609
2610 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2611 if (ret < 0)
2612 return ret;
2613
2614 eb = path->nodes[0];
2615 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2616 ret = btrfs_next_leaf(fs_root, path);
2617 if (ret)
2618 return ret;
2619 eb = path->nodes[0];
2620 }
2621
2622 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2623 if (found_key->type != key.type ||
2624 found_key->objectid != key.objectid)
2625 return 1;
2626
2627 return 0;
2628}
2629
Googler9398cc32022-12-02 17:21:52 +08002630static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2631 struct btrfs_path *p,
2632 int write_lock_level)
2633{
2634 struct btrfs_fs_info *fs_info = root->fs_info;
2635 struct extent_buffer *b;
Googlerb48fa912023-03-17 12:40:29 +05302636 int root_lock;
Googler9398cc32022-12-02 17:21:52 +08002637 int level = 0;
2638
Googlerb48fa912023-03-17 12:40:29 +05302639 /* We try very hard to do read locks on the root */
2640 root_lock = BTRFS_READ_LOCK;
2641
Googler9398cc32022-12-02 17:21:52 +08002642 if (p->search_commit_root) {
2643 /*
2644 * The commit roots are read only so we always do read locks,
2645 * and we always must hold the commit_root_sem when doing
2646 * searches on them, the only exception is send where we don't
2647 * want to block transaction commits for a long time, so
2648 * we need to clone the commit root in order to avoid races
2649 * with transaction commits that create a snapshot of one of
2650 * the roots used by a send operation.
2651 */
2652 if (p->need_commit_sem) {
2653 down_read(&fs_info->commit_root_sem);
2654 b = btrfs_clone_extent_buffer(root->commit_root);
2655 up_read(&fs_info->commit_root_sem);
2656 if (!b)
2657 return ERR_PTR(-ENOMEM);
2658
2659 } else {
2660 b = root->commit_root;
2661 extent_buffer_get(b);
2662 }
2663 level = btrfs_header_level(b);
2664 /*
2665 * Ensure that all callers have set skip_locking when
2666 * p->search_commit_root = 1.
2667 */
2668 ASSERT(p->skip_locking == 1);
2669
2670 goto out;
2671 }
2672
2673 if (p->skip_locking) {
2674 b = btrfs_root_node(root);
2675 level = btrfs_header_level(b);
2676 goto out;
2677 }
2678
Googler9398cc32022-12-02 17:21:52 +08002679 /*
2680 * If the level is set to maximum, we can skip trying to get the read
2681 * lock.
2682 */
2683 if (write_lock_level < BTRFS_MAX_LEVEL) {
2684 /*
2685 * We don't know the level of the root node until we actually
2686 * have it read locked
2687 */
2688 b = btrfs_read_lock_root_node(root);
2689 level = btrfs_header_level(b);
2690 if (level > write_lock_level)
2691 goto out;
2692
2693 /* Whoops, must trade for write lock */
2694 btrfs_tree_read_unlock(b);
2695 free_extent_buffer(b);
2696 }
2697
2698 b = btrfs_lock_root_node(root);
2699 root_lock = BTRFS_WRITE_LOCK;
2700
2701 /* The level might have changed, check again */
2702 level = btrfs_header_level(b);
2703
2704out:
Googler9398cc32022-12-02 17:21:52 +08002705 p->nodes[level] = b;
2706 if (!p->skip_locking)
2707 p->locks[level] = root_lock;
2708 /*
2709 * Callers are responsible for dropping b's references.
2710 */
2711 return b;
2712}
2713
2714
Googler0109c452022-10-13 17:50:39 +08002715/*
Googler9398cc32022-12-02 17:21:52 +08002716 * btrfs_search_slot - look for a key in a tree and perform necessary
2717 * modifications to preserve tree invariants.
Googler0109c452022-10-13 17:50:39 +08002718 *
Googler9398cc32022-12-02 17:21:52 +08002719 * @trans: Handle of transaction, used when modifying the tree
2720 * @p: Holds all btree nodes along the search path
2721 * @root: The root node of the tree
2722 * @key: The key we are looking for
2723 * @ins_len: Indicates purpose of search, for inserts it is 1, for
2724 * deletions it's -1. 0 for plain searches
2725 * @cow: boolean should CoW operations be performed. Must always be 1
2726 * when modifying the tree.
Googler0109c452022-10-13 17:50:39 +08002727 *
Googler9398cc32022-12-02 17:21:52 +08002728 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2729 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2730 *
2731 * If @key is found, 0 is returned and you can find the item in the leaf level
2732 * of the path (level 0)
2733 *
2734 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2735 * points to the slot where it should be inserted
2736 *
2737 * If an error is encountered while searching the tree a negative error number
2738 * is returned
Googler0109c452022-10-13 17:50:39 +08002739 */
Googler9398cc32022-12-02 17:21:52 +08002740int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2741 const struct btrfs_key *key, struct btrfs_path *p,
2742 int ins_len, int cow)
Googler0109c452022-10-13 17:50:39 +08002743{
Googleraf606d22022-10-26 21:40:12 -07002744 struct extent_buffer *b;
2745 int slot;
2746 int ret;
2747 int err;
2748 int level;
2749 int lowest_unlock = 1;
2750 /* everything at write_lock_level or lower must be write locked */
2751 int write_lock_level = 0;
2752 u8 lowest_level = 0;
2753 int min_write_lock_level;
2754 int prev_cmp;
2755
2756 lowest_level = p->lowest_level;
2757 WARN_ON(lowest_level && ins_len > 0);
2758 WARN_ON(p->nodes[0] != NULL);
2759 BUG_ON(!cow && ins_len);
2760
2761 if (ins_len < 0) {
2762 lowest_unlock = 2;
2763
2764 /* when we are removing items, we might have to go up to level
2765 * two as we update tree pointers Make sure we keep write
2766 * for those levels as well
2767 */
2768 write_lock_level = 2;
2769 } else if (ins_len > 0) {
2770 /*
2771 * for inserting items, make sure we have a write lock on
2772 * level 1 so we can update keys
2773 */
2774 write_lock_level = 1;
2775 }
2776
2777 if (!cow)
2778 write_lock_level = -1;
2779
2780 if (cow && (p->keep_locks || p->lowest_level))
2781 write_lock_level = BTRFS_MAX_LEVEL;
2782
2783 min_write_lock_level = write_lock_level;
2784
2785again:
2786 prev_cmp = -1;
Googler9398cc32022-12-02 17:21:52 +08002787 b = btrfs_search_slot_get_root(root, p, write_lock_level);
2788 if (IS_ERR(b)) {
2789 ret = PTR_ERR(b);
2790 goto done;
Googleraf606d22022-10-26 21:40:12 -07002791 }
2792
2793 while (b) {
2794 level = btrfs_header_level(b);
2795
2796 /*
2797 * setup the path here so we can release it under lock
2798 * contention with the cow code
2799 */
2800 if (cow) {
2801 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2802
2803 /*
2804 * if we don't really need to cow this block
2805 * then we don't want to set the path blocking,
2806 * so we test it here
2807 */
2808 if (!should_cow_block(trans, root, b)) {
2809 trans->dirty = true;
2810 goto cow_done;
2811 }
2812
2813 /*
2814 * must have write locks on this node and the
2815 * parent
2816 */
2817 if (level > write_lock_level ||
2818 (level + 1 > write_lock_level &&
2819 level + 1 < BTRFS_MAX_LEVEL &&
2820 p->nodes[level + 1])) {
2821 write_lock_level = level + 1;
2822 btrfs_release_path(p);
2823 goto again;
2824 }
2825
2826 btrfs_set_path_blocking(p);
2827 if (last_level)
2828 err = btrfs_cow_block(trans, root, b, NULL, 0,
2829 &b);
2830 else
2831 err = btrfs_cow_block(trans, root, b,
2832 p->nodes[level + 1],
2833 p->slots[level + 1], &b);
2834 if (err) {
2835 ret = err;
2836 goto done;
2837 }
2838 }
2839cow_done:
2840 p->nodes[level] = b;
Googler9398cc32022-12-02 17:21:52 +08002841 /*
2842 * Leave path with blocking locks to avoid massive
2843 * lock context switch, this is made on purpose.
2844 */
Googleraf606d22022-10-26 21:40:12 -07002845
2846 /*
2847 * we have a lock on b and as long as we aren't changing
2848 * the tree, there is no way to for the items in b to change.
2849 * It is safe to drop the lock on our parent before we
2850 * go through the expensive btree search on b.
2851 *
2852 * If we're inserting or deleting (ins_len != 0), then we might
2853 * be changing slot zero, which may require changing the parent.
2854 * So, we can't drop the lock until after we know which slot
2855 * we're operating on.
2856 */
2857 if (!ins_len && !p->keep_locks) {
2858 int u = level + 1;
2859
2860 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2861 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2862 p->locks[u] = 0;
2863 }
2864 }
2865
2866 ret = key_search(b, key, level, &prev_cmp, &slot);
2867 if (ret < 0)
2868 goto done;
2869
2870 if (level != 0) {
2871 int dec = 0;
2872 if (ret && slot > 0) {
2873 dec = 1;
2874 slot -= 1;
2875 }
2876 p->slots[level] = slot;
2877 err = setup_nodes_for_search(trans, root, p, b, level,
2878 ins_len, &write_lock_level);
2879 if (err == -EAGAIN)
2880 goto again;
2881 if (err) {
2882 ret = err;
2883 goto done;
2884 }
2885 b = p->nodes[level];
2886 slot = p->slots[level];
2887
2888 /*
2889 * slot 0 is special, if we change the key
2890 * we have to update the parent pointer
2891 * which means we must have a write lock
2892 * on the parent
2893 */
2894 if (slot == 0 && ins_len &&
2895 write_lock_level < level + 1) {
2896 write_lock_level = level + 1;
2897 btrfs_release_path(p);
2898 goto again;
2899 }
2900
2901 unlock_up(p, level, lowest_unlock,
2902 min_write_lock_level, &write_lock_level);
2903
2904 if (level == lowest_level) {
2905 if (dec)
2906 p->slots[level]++;
2907 goto done;
2908 }
2909
Googler9398cc32022-12-02 17:21:52 +08002910 err = read_block_for_search(root, p, &b, level,
2911 slot, key);
Googleraf606d22022-10-26 21:40:12 -07002912 if (err == -EAGAIN)
2913 goto again;
2914 if (err) {
2915 ret = err;
2916 goto done;
2917 }
2918
2919 if (!p->skip_locking) {
2920 level = btrfs_header_level(b);
2921 if (level <= write_lock_level) {
Googler9398cc32022-12-02 17:21:52 +08002922 if (!btrfs_try_tree_write_lock(b)) {
Googleraf606d22022-10-26 21:40:12 -07002923 btrfs_set_path_blocking(p);
2924 btrfs_tree_lock(b);
Googleraf606d22022-10-26 21:40:12 -07002925 }
2926 p->locks[level] = BTRFS_WRITE_LOCK;
2927 } else {
Googler9398cc32022-12-02 17:21:52 +08002928 if (!btrfs_tree_read_lock_atomic(b)) {
Googleraf606d22022-10-26 21:40:12 -07002929 btrfs_set_path_blocking(p);
2930 btrfs_tree_read_lock(b);
Googleraf606d22022-10-26 21:40:12 -07002931 }
2932 p->locks[level] = BTRFS_READ_LOCK;
2933 }
2934 p->nodes[level] = b;
2935 }
2936 } else {
2937 p->slots[level] = slot;
2938 if (ins_len > 0 &&
Googler9398cc32022-12-02 17:21:52 +08002939 btrfs_leaf_free_space(b) < ins_len) {
Googleraf606d22022-10-26 21:40:12 -07002940 if (write_lock_level < 1) {
2941 write_lock_level = 1;
2942 btrfs_release_path(p);
2943 goto again;
2944 }
2945
2946 btrfs_set_path_blocking(p);
2947 err = split_leaf(trans, root, key,
2948 p, ins_len, ret == 0);
Googleraf606d22022-10-26 21:40:12 -07002949
2950 BUG_ON(err > 0);
2951 if (err) {
2952 ret = err;
2953 goto done;
2954 }
2955 }
2956 if (!p->search_for_split)
2957 unlock_up(p, level, lowest_unlock,
Googler9398cc32022-12-02 17:21:52 +08002958 min_write_lock_level, NULL);
Googleraf606d22022-10-26 21:40:12 -07002959 goto done;
2960 }
2961 }
2962 ret = 1;
2963done:
2964 /*
2965 * we don't really know what they plan on doing with the path
2966 * from here on, so for now just mark it as blocking
2967 */
2968 if (!p->leave_spinning)
2969 btrfs_set_path_blocking(p);
2970 if (ret < 0 && !p->skip_release_on_error)
2971 btrfs_release_path(p);
2972 return ret;
2973}
2974
2975/*
2976 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2977 * current state of the tree together with the operations recorded in the tree
2978 * modification log to search for the key in a previous version of this tree, as
2979 * denoted by the time_seq parameter.
2980 *
2981 * Naturally, there is no support for insert, delete or cow operations.
2982 *
2983 * The resulting path and return value will be set up as if we called
2984 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2985 */
Googler9398cc32022-12-02 17:21:52 +08002986int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
Googleraf606d22022-10-26 21:40:12 -07002987 struct btrfs_path *p, u64 time_seq)
2988{
Googler9398cc32022-12-02 17:21:52 +08002989 struct btrfs_fs_info *fs_info = root->fs_info;
Googleraf606d22022-10-26 21:40:12 -07002990 struct extent_buffer *b;
2991 int slot;
2992 int ret;
2993 int err;
2994 int level;
2995 int lowest_unlock = 1;
2996 u8 lowest_level = 0;
2997 int prev_cmp = -1;
2998
2999 lowest_level = p->lowest_level;
3000 WARN_ON(p->nodes[0] != NULL);
3001
3002 if (p->search_commit_root) {
3003 BUG_ON(time_seq);
3004 return btrfs_search_slot(NULL, root, key, p, 0, 0);
3005 }
3006
3007again:
3008 b = get_old_root(root, time_seq);
3009 if (!b) {
3010 ret = -EIO;
3011 goto done;
3012 }
3013 level = btrfs_header_level(b);
3014 p->locks[level] = BTRFS_READ_LOCK;
3015
3016 while (b) {
3017 level = btrfs_header_level(b);
3018 p->nodes[level] = b;
Googleraf606d22022-10-26 21:40:12 -07003019
3020 /*
3021 * we have a lock on b and as long as we aren't changing
3022 * the tree, there is no way to for the items in b to change.
3023 * It is safe to drop the lock on our parent before we
3024 * go through the expensive btree search on b.
3025 */
3026 btrfs_unlock_up_safe(p, level + 1);
3027
3028 /*
3029 * Since we can unwind ebs we want to do a real search every
3030 * time.
3031 */
3032 prev_cmp = -1;
3033 ret = key_search(b, key, level, &prev_cmp, &slot);
Googler9398cc32022-12-02 17:21:52 +08003034 if (ret < 0)
3035 goto done;
Googleraf606d22022-10-26 21:40:12 -07003036
3037 if (level != 0) {
3038 int dec = 0;
3039 if (ret && slot > 0) {
3040 dec = 1;
3041 slot -= 1;
3042 }
3043 p->slots[level] = slot;
3044 unlock_up(p, level, lowest_unlock, 0, NULL);
3045
3046 if (level == lowest_level) {
3047 if (dec)
3048 p->slots[level]++;
3049 goto done;
3050 }
3051
Googler9398cc32022-12-02 17:21:52 +08003052 err = read_block_for_search(root, p, &b, level,
3053 slot, key);
Googleraf606d22022-10-26 21:40:12 -07003054 if (err == -EAGAIN)
3055 goto again;
3056 if (err) {
3057 ret = err;
3058 goto done;
3059 }
3060
3061 level = btrfs_header_level(b);
Googler9398cc32022-12-02 17:21:52 +08003062 if (!btrfs_tree_read_lock_atomic(b)) {
Googleraf606d22022-10-26 21:40:12 -07003063 btrfs_set_path_blocking(p);
3064 btrfs_tree_read_lock(b);
Googleraf606d22022-10-26 21:40:12 -07003065 }
Googler9398cc32022-12-02 17:21:52 +08003066 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
Googleraf606d22022-10-26 21:40:12 -07003067 if (!b) {
3068 ret = -ENOMEM;
3069 goto done;
3070 }
3071 p->locks[level] = BTRFS_READ_LOCK;
3072 p->nodes[level] = b;
3073 } else {
3074 p->slots[level] = slot;
3075 unlock_up(p, level, lowest_unlock, 0, NULL);
3076 goto done;
3077 }
3078 }
3079 ret = 1;
3080done:
3081 if (!p->leave_spinning)
3082 btrfs_set_path_blocking(p);
3083 if (ret < 0)
3084 btrfs_release_path(p);
3085
3086 return ret;
3087}
3088
3089/*
3090 * helper to use instead of search slot if no exact match is needed but
3091 * instead the next or previous item should be returned.
3092 * When find_higher is true, the next higher item is returned, the next lower
3093 * otherwise.
3094 * When return_any and find_higher are both true, and no higher item is found,
3095 * return the next lower instead.
3096 * When return_any is true and find_higher is false, and no lower item is found,
3097 * return the next higher instead.
3098 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3099 * < 0 on error
3100 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		/* Slot may point past the last item; move to the next leaf. */
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* Clamp to the last valid item in the leaf. */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			/* Insertion point is after the wanted item. */
			--p->slots[0];
		}
	}
	return 0;
}
3165
3166/*
3167 * adjust the pointers going up the tree, starting at level
3168 * making sure the right key of each node is points to 'key'.
3169 * This is used after shifting pointers to the left, so it stops
3170 * fixing up pointers when a given leaf/node is not in slot 0 of the
3171 * higher levels
3172 *
3173 */
Googler9398cc32022-12-02 17:21:52 +08003174static void fixup_low_keys(struct btrfs_path *path,
Googleraf606d22022-10-26 21:40:12 -07003175 struct btrfs_disk_key *key, int level)
3176{
3177 int i;
3178 struct extent_buffer *t;
Googler9398cc32022-12-02 17:21:52 +08003179 int ret;
Googleraf606d22022-10-26 21:40:12 -07003180
3181 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3182 int tslot = path->slots[i];
Googler9398cc32022-12-02 17:21:52 +08003183
Googleraf606d22022-10-26 21:40:12 -07003184 if (!path->nodes[i])
3185 break;
3186 t = path->nodes[i];
Googler9398cc32022-12-02 17:21:52 +08003187 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3188 GFP_ATOMIC);
3189 BUG_ON(ret < 0);
Googleraf606d22022-10-26 21:40:12 -07003190 btrfs_set_node_key(t, key, tslot);
3191 btrfs_mark_buffer_dirty(path->nodes[i]);
3192 if (tslot != 0)
3193 break;
3194 }
3195}
3196
3197/*
3198 * update item key.
3199 *
3200 * This function isn't completely safe. It's the caller's responsibility
3201 * that the new key won't break the order
3202 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	/* The new key must still sort strictly after the previous item. */
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	/* ... and strictly before the next item, if there is one. */
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* Slot 0 keys are duplicated in the parents; propagate upward. */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
3248
3249/*
3250 * try to push data from one node into the next node left in the
3251 * tree.
3252 *
3253 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3254 * error, and > 0 if there was no room in the left hand block.
3255 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free slots available in the destination node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both nodes must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* Record the copy in the tree mod log before mutating the nodes. */
	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3320
3321/*
3322 * try to push data from one node into the next node right in the
3323 * tree.
3324 *
3325 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3326 * error, and > 0 if there was no room in the right hand block.
3327 *
3328 * this will only push up to 1/2 the contents of the left node over
3329 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both nodes must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free slots available in the destination node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	/* Push at most half the source's contents (plus one). */
	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* Log and make room at the front of dst for the incoming ptrs. */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* Log the copy of src's tail into dst's head before copying. */
	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3387
3388/*
3389 * helper function to insert a new root level in the tree.
3390 * A new node is allocated, and a single item is inserted to
3391 * point to the existing root
3392 *
3393 * returns zero on success or < 0 on failure.
3394 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* The new root's single key is the first key of the old root. */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* Log the root replacement, then publish the new root via RCU. */
	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Extra ref: the path holds its own reference on the new root. */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3448
3449/*
3450 * worker function to insert a single pointer in a node.
3451 * the node should have enough room for the pointer already
3452 *
3453 * slot and level indicate where you want the key to go, and
3454 * blocknr is the block the key points to.
3455 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* Caller guarantees there is room for one more pointer. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/* Leaves (level 0) are not tracked by the tree mod log. */
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3494
3495/*
3496 * split the node at the specified level in path in two.
3497 * The path is corrected to point to the appropriate node after the split
3498 *
3499 * Before splitting this tries to make some room in the node by pushing
3500 * left and right, if either one works, it returns right away.
3501 *
3502 * returns 0 on success and < 0 on failure
3503 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Try rebalancing into siblings before splitting. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Log the move of the upper half into the new node. */
	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new node into the parent, right after 'c'. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* Point the path at whichever half now contains our slot. */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3585
3586/*
3587 * how many bytes are required to store the items in a leaf. start
3588 * and nr indicate which items in the leaf to check. This totals up the
3589 * space used both by the item structs and the item data
3590 */
3591static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3592{
3593 struct btrfs_item *start_item;
3594 struct btrfs_item *end_item;
3595 struct btrfs_map_token token;
3596 int data_len;
3597 int nritems = btrfs_header_nritems(l);
3598 int end = min(nritems, start + nr) - 1;
3599
3600 if (!nr)
3601 return 0;
Googler9398cc32022-12-02 17:21:52 +08003602 btrfs_init_map_token(&token, l);
Googleraf606d22022-10-26 21:40:12 -07003603 start_item = btrfs_item_nr(start);
3604 end_item = btrfs_item_nr(end);
3605 data_len = btrfs_token_item_offset(l, start_item, &token) +
3606 btrfs_token_item_size(l, start_item, &token);
3607 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3608 data_len += sizeof(struct btrfs_item) * nr;
3609 WARN_ON(data_len < 0);
3610 return data_len;
3611}
3612
3613/*
3614 * The space between the end of the leaf items and
3615 * the start of the leaf data. IOW, how much room
3616 * the leaf has left for both items and data
3617 */
Googler9398cc32022-12-02 17:21:52 +08003618noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
Googleraf606d22022-10-26 21:40:12 -07003619{
Googler9398cc32022-12-02 17:21:52 +08003620 struct btrfs_fs_info *fs_info = leaf->fs_info;
Googleraf606d22022-10-26 21:40:12 -07003621 int nritems = btrfs_header_nritems(leaf);
3622 int ret;
Googler9398cc32022-12-02 17:21:52 +08003623
3624 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
Googleraf606d22022-10-26 21:40:12 -07003625 if (ret < 0) {
Googler9398cc32022-12-02 17:21:52 +08003626 btrfs_crit(fs_info,
3627 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3628 ret,
3629 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3630 leaf_space_used(leaf, 0, nritems), nritems);
Googleraf606d22022-10-26 21:40:12 -07003631 }
3632 return ret;
3633}
3634
3635/*
3636 * min slot controls the lowest index we're willing to push to the
3637 * right. We'll push up to and including min_slot, but no lower
3638 */
Googler9398cc32022-12-02 17:21:52 +08003639static noinline int __push_leaf_right(struct btrfs_path *path,
Googleraf606d22022-10-26 21:40:12 -07003640 int data_size, int empty,
3641 struct extent_buffer *right,
3642 int free_space, u32 left_nritems,
3643 u32 min_slot)
3644{
Googler9398cc32022-12-02 17:21:52 +08003645 struct btrfs_fs_info *fs_info = right->fs_info;
Googleraf606d22022-10-26 21:40:12 -07003646 struct extent_buffer *left = path->nodes[0];
3647 struct extent_buffer *upper = path->nodes[1];
3648 struct btrfs_map_token token;
3649 struct btrfs_disk_key disk_key;
3650 int slot;
3651 u32 i;
3652 int push_space = 0;
3653 int push_items = 0;
3654 struct btrfs_item *item;
3655 u32 nr;
3656 u32 right_nritems;
3657 u32 data_end;
3658 u32 this_item_size;
3659
Googleraf606d22022-10-26 21:40:12 -07003660 if (empty)
3661 nr = 0;
3662 else
3663 nr = max_t(u32, 1, min_slot);
3664
3665 if (path->slots[0] >= left_nritems)
3666 push_space += data_size;
3667
3668 slot = path->slots[1];
3669 i = left_nritems - 1;
3670 while (i >= nr) {
3671 item = btrfs_item_nr(i);
3672
3673 if (!empty && push_items > 0) {
3674 if (path->slots[0] > i)
3675 break;
3676 if (path->slots[0] == i) {
Googler9398cc32022-12-02 17:21:52 +08003677 int space = btrfs_leaf_free_space(left);
3678
Googleraf606d22022-10-26 21:40:12 -07003679 if (space + push_space * 2 > free_space)
3680 break;
3681 }
3682 }
3683
3684 if (path->slots[0] == i)
3685 push_space += data_size;
3686
3687 this_item_size = btrfs_item_size(left, item);
3688 if (this_item_size + sizeof(*item) + push_space > free_space)
3689 break;
3690
3691 push_items++;
3692 push_space += this_item_size + sizeof(*item);
3693 if (i == 0)
3694 break;
3695 i--;
3696 }
3697
3698 if (push_items == 0)
3699 goto out_unlock;
3700
3701 WARN_ON(!empty && push_items == left_nritems);
3702
3703 /* push left to right */
3704 right_nritems = btrfs_header_nritems(right);
3705
3706 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
Googler9398cc32022-12-02 17:21:52 +08003707 push_space -= leaf_data_end(left);
Googleraf606d22022-10-26 21:40:12 -07003708
3709 /* make room in the right data area */
Googler9398cc32022-12-02 17:21:52 +08003710 data_end = leaf_data_end(right);
Googleraf606d22022-10-26 21:40:12 -07003711 memmove_extent_buffer(right,
Googler9398cc32022-12-02 17:21:52 +08003712 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3713 BTRFS_LEAF_DATA_OFFSET + data_end,
3714 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
Googleraf606d22022-10-26 21:40:12 -07003715
3716 /* copy from the left data area */
Googler9398cc32022-12-02 17:21:52 +08003717 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3718 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3719 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
Googleraf606d22022-10-26 21:40:12 -07003720 push_space);
3721
3722 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3723 btrfs_item_nr_offset(0),
3724 right_nritems * sizeof(struct btrfs_item));
3725
3726 /* copy the items from left to right */
3727 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3728 btrfs_item_nr_offset(left_nritems - push_items),
3729 push_items * sizeof(struct btrfs_item));
3730
3731 /* update the item pointers */
Googler9398cc32022-12-02 17:21:52 +08003732 btrfs_init_map_token(&token, right);
Googleraf606d22022-10-26 21:40:12 -07003733 right_nritems += push_items;
3734 btrfs_set_header_nritems(right, right_nritems);
Googler9398cc32022-12-02 17:21:52 +08003735 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
Googleraf606d22022-10-26 21:40:12 -07003736 for (i = 0; i < right_nritems; i++) {
3737 item = btrfs_item_nr(i);
3738 push_space -= btrfs_token_item_size(right, item, &token);
3739 btrfs_set_token_item_offset(right, item, push_space, &token);
3740 }
3741
3742 left_nritems -= push_items;
3743 btrfs_set_header_nritems(left, left_nritems);
3744
3745 if (left_nritems)
3746 btrfs_mark_buffer_dirty(left);
3747 else
Googler9398cc32022-12-02 17:21:52 +08003748 btrfs_clean_tree_block(left);
Googleraf606d22022-10-26 21:40:12 -07003749
3750 btrfs_mark_buffer_dirty(right);
3751
3752 btrfs_item_key(right, &disk_key, 0);
3753 btrfs_set_node_key(upper, &disk_key, slot + 1);
3754 btrfs_mark_buffer_dirty(upper);
3755
3756 /* then fixup the leaf pointer in the path */
3757 if (path->slots[0] >= left_nritems) {
3758 path->slots[0] -= left_nritems;
3759 if (btrfs_header_nritems(path->nodes[0]) == 0)
Googler9398cc32022-12-02 17:21:52 +08003760 btrfs_clean_tree_block(path->nodes[0]);
Googleraf606d22022-10-26 21:40:12 -07003761 btrfs_tree_unlock(path->nodes[0]);
3762 free_extent_buffer(path->nodes[0]);
3763 path->nodes[0] = right;
3764 path->slots[1] += 1;
3765 } else {
3766 btrfs_tree_unlock(right);
3767 free_extent_buffer(right);
3768 }
3769 return 0;
3770
3771out_unlock:
3772 btrfs_tree_unlock(right);
3773 free_extent_buffer(right);
3774 return 1;
3775}
3776
3777/*
3778 * push some data in the path leaf to the right, trying to free up at
3779 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3780 *
3781 * returns 1 if the push failed because the other node didn't have enough
3782 * room, 0 if everything worked out and < 0 if there were major errors.
3783 *
3784 * this will push starting from min_slot to the end of the leaf. It won't
3785 * push any slot lower than min_slot
3786 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* No parent node: this leaf is the tree root, nowhere to push to. */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* Already the rightmost child of the parent: no right sibling. */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking_write(right);

	/* Cheap space check before paying for the COW below. */
	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* 'right' now refers to the writable copy, so re-check its space. */
	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right() takes over the lock and reference on 'right'. */
	return __push_leaf_right(path, min_data_size, empty,
				 right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3859
3860/*
3861 * push some data in the path leaf to the left, trying to free up at
3862 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3863 *
3864 * max_slot can put a limit on how far into the leaf we'll push items. The
3865 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3866 * items
3867 */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	/* Unless we are emptying the leaf, always leave at least one item. */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/*
	 * Work out how many items (push_items) and how many data bytes
	 * (push_space) from the front of 'right' will fit into 'left'.
	 */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* Reserve room for the pending insertion at our slot. */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* Rebase the copied items' data offsets for their place in 'left'. */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* Slide the surviving data back against the end of the leaf. */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		/* ... and the surviving items to the front of the item area. */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	/* Recompute the surviving items' data offsets from scratch. */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	/* The first key of 'right' changed; propagate it up the path. */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* Our slot moved into 'left'; hand the path over to it. */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4013
4014/*
4015 * push some data in the path leaf to the left, trying to free up at
4016 * least data_size bytes. returns zero if the push worked, nonzero otherwise
4017 *
4018 * max_slot can put a limit on how far into the leaf we'll push items. The
4019 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
4020 * items
4021 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	/* Leftmost child of the parent: no left sibling to push into. */
	if (slot == 0)
		return 1;
	/* No parent node: this leaf is the tree root. */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking_write(left);

	/* Cheap space check before paying for the COW below. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* 'left' now refers to the writable copy, so re-check its space. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left() takes over the lock and reference on 'left'. */
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4086
4087/*
4088 * split the path's leaf in two, making sure there is at least data_size
4089 * available for the resulting leaf level of the path.
4090 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* Items [mid, nritems) of 'l' move into the fresh leaf 'right'. */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* Copy their data so it stays packed against the end of 'right'. */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* Delta between each data blob's old and new offset. */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* Link the new leaf into the parent right after 'l'. */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* Point the path at whichever leaf now holds our slot. */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4150
4151/*
4152 * double splits happen when we need to insert a big item in the middle
4153 * of a leaf. A double split can leave us with 3 mostly empty leaves:
4154 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4155 * A B C
4156 *
4157 * We avoid this by trying to push the items on either side of our target
4158 * into the adjacent leaves. If all goes well we can avoid the double split
4159 * completely.
4160 */
4161static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4162 struct btrfs_root *root,
4163 struct btrfs_path *path,
4164 int data_size)
4165{
Googleraf606d22022-10-26 21:40:12 -07004166 int ret;
4167 int progress = 0;
4168 int slot;
4169 u32 nritems;
4170 int space_needed = data_size;
4171
4172 slot = path->slots[0];
4173 if (slot < btrfs_header_nritems(path->nodes[0]))
Googler9398cc32022-12-02 17:21:52 +08004174 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
Googleraf606d22022-10-26 21:40:12 -07004175
4176 /*
4177 * try to push all the items after our slot into the
4178 * right leaf
4179 */
4180 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4181 if (ret < 0)
4182 return ret;
4183
4184 if (ret == 0)
4185 progress++;
4186
4187 nritems = btrfs_header_nritems(path->nodes[0]);
4188 /*
4189 * our goal is to get our slot at the start or end of a leaf. If
4190 * we've done so we're done
4191 */
4192 if (path->slots[0] == 0 || path->slots[0] == nritems)
4193 return 0;
4194
Googler9398cc32022-12-02 17:21:52 +08004195 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
Googleraf606d22022-10-26 21:40:12 -07004196 return 0;
4197
4198 /* try to push all the items before our slot into the next leaf */
4199 slot = path->slots[0];
Googler9398cc32022-12-02 17:21:52 +08004200 space_needed = data_size;
4201 if (slot > 0)
4202 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
Googleraf606d22022-10-26 21:40:12 -07004203 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4204 if (ret < 0)
4205 return ret;
4206
4207 if (ret == 0)
4208 progress++;
4209
4210 if (progress)
4211 return 0;
4212 return 1;
4213}
4214
4215/*
4216 * split the path's leaf in two, making sure there is at least data_size
4217 * available for the resulting leaf level of the path.
4218 *
4219 * returns 0 if all went well and < 0 on failure.
4220 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	/*
	 * split == 1: normal split at 'mid'
	 * split == 0: don't move anything, the new item goes alone into a
	 *             fresh empty leaf
	 * split == 2: one split is not enough, a second pass is needed
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* An extended item can never exceed the leaf data area. */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* Splitting the root leaf: grow the tree by one level first. */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		/* Our slot lands in the right half after a split at 'mid'. */
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		/* Our slot lands in the left half after a split at 'mid'. */
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* The new leaf's first key: the inserted key, or the key at 'mid'. */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* Hand the caller an empty leaf positioned for the new item. */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* Only one extra pass is ever needed. */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* Try pushing neighbors to make the item fit without a double split. */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4380
/*
 * Make sure the leaf at path->nodes[0] has at least ins_len bytes free,
 * re-searching for the item (with write access) and splitting the leaf if
 * necessary.  Returns 0 on success, -EAGAIN if the item changed underneath
 * us and the caller must retry, or another negative errno.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Only extent data and csum items are ever split this way. */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/* Remember enough about the item to detect changes after re-search. */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone; caller must retry. */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room. return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4447
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* Caller (setup_leaf_for_split) guaranteed room for one more item. */
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* Stash a copy of the item's data; the leaf is edited in place. */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* The new item inherits the tail [split_offset, item_size). */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* The original item keeps the head [0, split_offset). */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
4516
4517/*
4518 * This function splits a single item into two items,
4519 * giving 'new_key' to the new item and splitting the
4520 * old one at split_offset (from the start of the item).
4521 *
4522 * The path may be released by this operation. After
4523 * the split, the path is pointing to the old item. The
4524 * new item is going to be in the same node as the old one.
4525 *
 * Note: the item being split must be small enough to live alone on
4527 * a tree block with room for one extra struct btrfs_item
4528 *
4529 * This allows us to split the item in place, keeping a lock on the
4530 * leaf the entire time.
4531 */
4532int btrfs_split_item(struct btrfs_trans_handle *trans,
4533 struct btrfs_root *root,
4534 struct btrfs_path *path,
Googler9398cc32022-12-02 17:21:52 +08004535 const struct btrfs_key *new_key,
Googleraf606d22022-10-26 21:40:12 -07004536 unsigned long split_offset)
4537{
4538 int ret;
4539 ret = setup_leaf_for_split(trans, root, path,
4540 sizeof(struct btrfs_item));
4541 if (ret)
4542 return ret;
4543
Googler9398cc32022-12-02 17:21:52 +08004544 ret = split_item(path, new_key, split_offset);
Googleraf606d22022-10-26 21:40:12 -07004545 return ret;
4546}
4547
4548/*
 * This function duplicates an item, giving 'new_key' to the new item.
4550 * It guarantees both items live in the same tree leaf and the new item
4551 * is contiguous with the original item.
4552 *
4553 * This allows us to split file extent in place, keeping a lock on the
4554 * leaf the entire time.
4555 */
4556int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4557 struct btrfs_root *root,
4558 struct btrfs_path *path,
Googler9398cc32022-12-02 17:21:52 +08004559 const struct btrfs_key *new_key)
Googleraf606d22022-10-26 21:40:12 -07004560{
4561 struct extent_buffer *leaf;
4562 int ret;
4563 u32 item_size;
4564
4565 leaf = path->nodes[0];
4566 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4567 ret = setup_leaf_for_split(trans, root, path,
4568 item_size + sizeof(struct btrfs_item));
4569 if (ret)
4570 return ret;
4571
4572 path->slots[0]++;
4573 setup_items_for_insert(root, path, new_key, &item_size,
4574 item_size, item_size +
4575 sizeof(struct btrfs_item), 1);
4576 leaf = path->nodes[0];
4577 memcpy_extent_buffer(leaf,
4578 btrfs_item_ptr_offset(leaf, path->slots[0]),
4579 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4580 item_size);
4581 return 0;
4582}
4583
4584/*
4585 * make the item pointed to by the path smaller. new_size indicates
4586 * how small to make it, and from_end tells us if we just chop bytes
4587 * off the end of the item or if we shift the item to chop bytes off
4588 * the front.
4589 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* Nothing to do when the item already has the requested size. */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* Chop off the tail: move everything up by size_diff. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/*
			 * Inline extents keep their fixed header at the item
			 * start; move it before shifting the data away.
			 */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* Chop off the head: shift the whole data blob up. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* Front truncation advances the key offset accordingly. */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4683
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 *
 * The room is created at the lower-offset end of the item's data by
 * shifting everything below it further down; the leaf must already have
 * at least data_size bytes free or we BUG out.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* growing an item must never overcommit the leaf */
	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		/* data grows towards lower offsets, so offsets shrink */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	/* negative free space means we corrupted the leaf layout */
	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4749
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * @root:	root the items are inserted into
 * @path:	points to the leaf and slot where the items go
 * @cpu_key:	array of @nr keys for the new items
 * @data_size:	array of @nr data sizes, one per item
 * @total_data:	sum of the entries in @data_size
 * @total_size:	@total_data plus @nr item headers
 * @nr:		number of items to insert
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/* inserting at slot 0 changes the leaf's first key; propagate it up */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* the caller must have made room via btrfs_search_slot */
	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	/* open a gap when we are not appending at the end of the leaf */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	/* negative free space means we corrupted the leaf layout */
	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4841
4842/*
4843 * Given a key and some data, insert items into the tree.
4844 * This does all the path init required, making room in the tree if needed.
4845 */
4846int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4847 struct btrfs_root *root,
4848 struct btrfs_path *path,
Googler9398cc32022-12-02 17:21:52 +08004849 const struct btrfs_key *cpu_key, u32 *data_size,
Googleraf606d22022-10-26 21:40:12 -07004850 int nr)
4851{
4852 int ret = 0;
4853 int slot;
4854 int i;
4855 u32 total_size = 0;
4856 u32 total_data = 0;
4857
4858 for (i = 0; i < nr; i++)
4859 total_data += data_size[i];
4860
4861 total_size = total_data + (nr * sizeof(struct btrfs_item));
4862 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4863 if (ret == 0)
4864 return -EEXIST;
4865 if (ret < 0)
4866 return ret;
4867
4868 slot = path->slots[0];
4869 BUG_ON(slot < 0);
4870
4871 setup_items_for_insert(root, path, cpu_key, data_size,
4872 total_data, total_size, nr);
4873 return 0;
4874}
4875
4876/*
4877 * Given a key and some data, insert an item into the tree.
4878 * This does all the path init required, making room in the tree if needed.
4879 */
Googler9398cc32022-12-02 17:21:52 +08004880int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4881 const struct btrfs_key *cpu_key, void *data,
4882 u32 data_size)
Googleraf606d22022-10-26 21:40:12 -07004883{
4884 int ret = 0;
4885 struct btrfs_path *path;
4886 struct extent_buffer *leaf;
4887 unsigned long ptr;
4888
4889 path = btrfs_alloc_path();
4890 if (!path)
4891 return -ENOMEM;
4892 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4893 if (!ret) {
4894 leaf = path->nodes[0];
4895 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4896 write_extent_buffer(leaf, data, ptr, data_size);
4897 btrfs_mark_buffer_dirty(leaf);
4898 }
4899 btrfs_free_path(path);
4900 return ret;
4901}
4902
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/* record the pointer move for tree-mod-log readers (nodes only) */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* last slot: nothing to move, just log the key removal */
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		/* the node's first key changed; fix the copies in the parents */
		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4948
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent. zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing. path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* take an extra ref so the buffer stays valid for the stale-drop below */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path. If that empties
 * the leaf, remove it from the tree
 *
 * Removes @nr contiguous items starting at @slot. Returns 0 on success,
 * or a negative errno propagated from the leaf-balancing helpers.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes owned by the items being removed */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	/* close the gap unless we are deleting from the tail of the leaf */
	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* the data moved up by dsize, adjust the survivors' offsets */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* the leaf's first key changed; fix the parent copies */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty. Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5093
5094/*
5095 * search the tree again to find a leaf with lesser keys
5096 * returns 0 if it found something or 1 if there are no lesser leaves.
5097 * returns < 0 on io errors.
5098 *
5099 * This may release the path, and so you may lose any locks held at the
5100 * time you call it.
5101 */
5102int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5103{
5104 struct btrfs_key key;
5105 struct btrfs_disk_key found_key;
5106 int ret;
5107
5108 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5109
5110 if (key.offset > 0) {
5111 key.offset--;
5112 } else if (key.type > 0) {
5113 key.type--;
5114 key.offset = (u64)-1;
5115 } else if (key.objectid > 0) {
5116 key.objectid--;
5117 key.type = (u8)-1;
5118 key.offset = (u64)-1;
5119 } else {
5120 return 1;
5121 }
5122
5123 btrfs_release_path(path);
5124 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5125 if (ret < 0)
5126 return ret;
5127 btrfs_item_key(path->nodes[0], &found_key, 0);
5128 ret = comp_keys(&found_key, &key);
5129 /*
5130 * We might have had an item with the previous key in the tree right
5131 * before we released our path. And after we released our path, that
5132 * item might have been pushed to the first slot (0) of the leaf we
5133 * were holding due to a tree balance. Alternatively, an item with the
5134 * previous key can exist as the only element of a leaf (big fat item).
5135 * Therefore account for these 2 cases, so that our callers (like
5136 * btrfs_previous_item) don't miss an existing item with a key matching
5137 * the previous key we computed above.
5138 */
5139 if (ret <= 0)
5140 return 0;
5141 return 1;
5142}
5143
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* keep locks held while walking down; restored before returning */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* the whole tree is older than min_trans; nothing can match */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		/* lock the child before releasing the levels above it */
		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* hand the key we stopped at back to the caller */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5268
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/* the parent is locked (or locking is off): just go up */
			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			/*
			 * no lock on the parent; re-search from this node's
			 * last key so the path becomes valid again
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* skip pointers older than min_trans */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5344
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0: walk the current version of the tree */
	return btrfs_next_old_leaf(root, path, 0);
}
5354
/*
 * Find the next leaf, as btrfs_next_leaf does, but against the tree state
 * at @time_seq when non-zero (used by tree-mod-log based backref walking).
 *
 * Returns 0 if a next leaf was found, 1 if there is none, < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key so we can re-search after dropping locks */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks. A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block. So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* walk up until we find a node with a slot to the right of ours */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the buffer from a previous iteration, if any */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* now walk back down, taking the leftmost (slot 0) child each time */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5523
5524/*
5525 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5526 * searching until it gets past min_objectid or finds an item of 'type'
5527 *
5528 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5529 */
5530int btrfs_previous_item(struct btrfs_root *root,
5531 struct btrfs_path *path, u64 min_objectid,
5532 int type)
5533{
5534 struct btrfs_key found_key;
5535 struct extent_buffer *leaf;
5536 u32 nritems;
5537 int ret;
5538
5539 while (1) {
5540 if (path->slots[0] == 0) {
5541 btrfs_set_path_blocking(path);
5542 ret = btrfs_prev_leaf(root, path);
5543 if (ret != 0)
5544 return ret;
5545 } else {
5546 path->slots[0]--;
5547 }
5548 leaf = path->nodes[0];
5549 nritems = btrfs_header_nritems(leaf);
5550 if (nritems == 0)
5551 return 1;
5552 if (path->slots[0] == nritems)
5553 path->slots[0]--;
5554
5555 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5556 if (found_key.objectid < min_objectid)
5557 break;
5558 if (found_key.type == type)
5559 return 0;
5560 if (found_key.objectid == min_objectid &&
5561 found_key.type < type)
5562 break;
5563 }
5564 return 1;
5565}
5566
5567/*
5568 * search in extent tree to find a previous Metadata/Data extent item with
5569 * min objecitd.
5570 *
5571 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5572 */
5573int btrfs_previous_extent_item(struct btrfs_root *root,
5574 struct btrfs_path *path, u64 min_objectid)
5575{
5576 struct btrfs_key found_key;
5577 struct extent_buffer *leaf;
5578 u32 nritems;
5579 int ret;
5580
5581 while (1) {
5582 if (path->slots[0] == 0) {
5583 btrfs_set_path_blocking(path);
5584 ret = btrfs_prev_leaf(root, path);
5585 if (ret != 0)
5586 return ret;
5587 } else {
5588 path->slots[0]--;
5589 }
5590 leaf = path->nodes[0];
5591 nritems = btrfs_header_nritems(leaf);
5592 if (nritems == 0)
5593 return 1;
5594 if (path->slots[0] == nritems)
5595 path->slots[0]--;
5596
5597 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5598 if (found_key.objectid < min_objectid)
5599 break;
5600 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5601 found_key.type == BTRFS_METADATA_ITEM_KEY)
5602 return 0;
5603 if (found_key.objectid == min_objectid &&
5604 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5605 break;
5606 }
5607 return 1;
5608}