// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
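
/*
 * Illustrative usage sketch (not part of the original file): a typical
 * caller pairs btrfs_alloc_path() with btrfs_free_path(), which drops any
 * locks and extent buffer references a search left behind:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	... examine path->nodes[0] / path->slots[0] ...
 *	btrfs_free_path(path);
 */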

/*
 * set all locked nodes in the path to blocking locks. This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled,
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path. You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
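
/*
 * Illustrative sketch (assumption, not from the original file): code that
 * may sleep while holding path locks converts them to blocking locks first
 * and back to spinning locks afterwards. do_something_that_may_sleep() is
 * a hypothetical placeholder:
 *
 *	btrfs_set_path_blocking(path);
 *	ret = do_something_that_may_sleep();
 *	btrfs_clear_path_blocking(path, NULL, 0);
 */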

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
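
/*
 * Sketch of path reuse (illustrative): btrfs_release_path() lets one path
 * be used for several searches without reallocating it:
 *
 *	ret = btrfs_search_slot(NULL, root, &key1, path, 0, 0);
 *	... use the result ...
 *	btrfs_release_path(path);
 *	ret = btrfs_search_slot(NULL, root, &key2, path, 0, 0);
 */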

/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
156
157 /* loop around taking references on and locking the root node of the
158 * tree until you end up with a lock on the root. A locked buffer
159 * is returned, with a reference held.
160 */
btrfs_lock_root_node(struct btrfs_root * root)161 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
162 {
163 struct extent_buffer *eb;
164
165 while (1) {
166 eb = btrfs_root_node(root);
167 btrfs_tree_lock(eb);
168 if (eb == root->node)
169 break;
170 btrfs_tree_unlock(eb);
171 free_extent_buffer(eb);
172 }
173 return eb;
174 }
175
176 /* loop around taking references on and locking the root node of the
177 * tree until you end up with a lock on the root. A locked buffer
178 * is returned, with a reference held.
179 */
btrfs_read_lock_root_node(struct btrfs_root * root)180 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
181 {
182 struct extent_buffer *eb;
183
184 while (1) {
185 eb = btrfs_root_node(root);
186 btrfs_tree_read_lock(eb);
187 if (eb == root->node)
188 break;
189 btrfs_tree_read_unlock(eb);
190 free_extent_buffer(eb);
191 }
192 return eb;
193 }

/* cow-only roots (everything not a reference counted cow subvolume) just
 * get put onto a simple dirty list. transaction.c walks this to make sure
 * they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure elem->seq is set to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
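
/*
 * Usage sketch (illustrative): a reader that needs a consistent view of
 * past tree states registers a blocker before walking and drops it when
 * done, so that log entries it may still need are not pruned:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... use elem.seq as the time_seq for rewind operations ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */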

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				write_unlock(&fs_info->tree_mod_log_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold the
 * lock until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
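
/*
 * Worked example (illustrative): if the log holds entries for one block
 * with sequence numbers 5, 8 and 12 and min_seq is 6, the seq 5 entry is
 * ignored by both helpers, tree_mod_log_search_oldest() returns the seq 8
 * entry and tree_mod_log_search() returns the seq 12 entry.
 */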

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
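
/*
 * Worked example (illustrative): suppose the last snapshot of a reference
 * counted root was taken in generation 100. A block written in generation
 * 90 may still be referenced by that snapshot, so it can be shared; a
 * block written in generation 110 is known to be exclusive, unless it
 * carries the RELOC header flag.
 */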

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
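
/*
 * Example (illustrative): with a 16KiB blocksize, blocks at 0 and 40960
 * are close (40960 - (0 + 16384) = 24576 < 32768), while blocks at 0 and
 * 65536 are not (65536 - 16384 = 49152 >= 32768).
 */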

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
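
/*
 * Example (illustrative): keys compare by objectid first, then type, then
 * offset, so (256 INODE_ITEM 0) < (256 EXTENT_DATA 0) < (257 INODE_ITEM 0),
 * because BTRFS_INODE_ITEM_KEY is numerically smaller than
 * BTRFS_EXTENT_DATA_KEY.
 */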

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1, &first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer. The items start at offset p,
 * and they are item_size apart. There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
			  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
1804
1805 /*
1806 * simple bin_search frontend that does the right thing for
1807 * leaves vs nodes
1808 */
btrfs_bin_search(struct extent_buffer * eb,const struct btrfs_key * key,int level,int * slot)1809 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1810 int level, int *slot)
1811 {
1812 if (level == 0)
1813 return generic_bin_search(eb,
1814 offsetof(struct btrfs_leaf, items),
1815 sizeof(struct btrfs_item),
1816 key, btrfs_header_nritems(eb),
1817 slot);
1818 else
1819 return generic_bin_search(eb,
1820 offsetof(struct btrfs_node, ptrs),
1821 sizeof(struct btrfs_key_ptr),
1822 key, btrfs_header_nritems(eb),
1823 slot);
1824 }
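/*
 * Illustrative sketch (not part of the original file): how a caller can
 * interpret the btrfs_bin_search() contract. The variables are assumed to
 * be a locked extent buffer, its level, and the key being looked up.
 */
#if 0	/* example only */
	{
		int slot;
		int ret = btrfs_bin_search(eb, &key, level, &slot);

		if (ret == 0) {
			/* exact match, the key lives at 'slot' */
		} else if (ret > 0) {
			/*
			 * no exact match, 'slot' is the insertion point; it
			 * may equal btrfs_header_nritems(eb) when the key is
			 * bigger than every key in the block.
			 */
		}
	}
#endif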
1825
1826 static void root_add_used(struct btrfs_root *root, u32 size)
1827 {
1828 spin_lock(&root->accounting_lock);
1829 btrfs_set_root_used(&root->root_item,
1830 btrfs_root_used(&root->root_item) + size);
1831 spin_unlock(&root->accounting_lock);
1832 }
1833
1834 static void root_sub_used(struct btrfs_root *root, u32 size)
1835 {
1836 spin_lock(&root->accounting_lock);
1837 btrfs_set_root_used(&root->root_item,
1838 btrfs_root_used(&root->root_item) - size);
1839 spin_unlock(&root->accounting_lock);
1840 }
1841
1842 /* given a node and slot number, this reads the block it points to. The
1843 * extent buffer is returned with a reference taken (but unlocked).
1844 */
1845 static noinline struct extent_buffer *
1846 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1847 int slot)
1848 {
1849 int level = btrfs_header_level(parent);
1850 struct extent_buffer *eb;
1851 struct btrfs_key first_key;
1852
1853 if (slot < 0 || slot >= btrfs_header_nritems(parent))
1854 return ERR_PTR(-ENOENT);
1855
1856 BUG_ON(level == 0);
1857
1858 btrfs_node_key_to_cpu(parent, &first_key, slot);
1859 eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1860 btrfs_node_ptr_generation(parent, slot),
1861 level - 1, &first_key);
1862 if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1863 free_extent_buffer(eb);
1864 eb = ERR_PTR(-EIO);
1865 }
1866
1867 return eb;
1868 }
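/*
 * Illustrative sketch (not part of the original file): read_node_slot()
 * returns a referenced but unlocked extent buffer, so the caller must take
 * any lock it needs and always drop the reference when done.
 */
#if 0	/* example only */
	{
		struct extent_buffer *eb;

		eb = read_node_slot(fs_info, parent, slot);
		if (!IS_ERR(eb)) {
			btrfs_tree_lock(eb);
			/* ... inspect or modify eb ... */
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);	/* drop the ref we were given */
		}
	}
#endif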
1869
1870 /*
1871 * node level balancing, used to make sure nodes are in proper order for
1872 * item deletion. We balance from the top down, so we have to make sure
1873 * that a deletion won't leave a node completely empty later on.
1874 */
1875 static noinline int balance_level(struct btrfs_trans_handle *trans,
1876 struct btrfs_root *root,
1877 struct btrfs_path *path, int level)
1878 {
1879 struct btrfs_fs_info *fs_info = root->fs_info;
1880 struct extent_buffer *right = NULL;
1881 struct extent_buffer *mid;
1882 struct extent_buffer *left = NULL;
1883 struct extent_buffer *parent = NULL;
1884 int ret = 0;
1885 int wret;
1886 int pslot;
1887 int orig_slot = path->slots[level];
1888 u64 orig_ptr;
1889
1890 if (level == 0)
1891 return 0;
1892
1893 mid = path->nodes[level];
1894
1895 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1896 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1897 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1898
1899 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1900
1901 if (level < BTRFS_MAX_LEVEL - 1) {
1902 parent = path->nodes[level + 1];
1903 pslot = path->slots[level + 1];
1904 }
1905
1906 /*
1907 * deal with the case where there is only one pointer in the root
1908 * by promoting the node below to a root
1909 */
1910 if (!parent) {
1911 struct extent_buffer *child;
1912
1913 if (btrfs_header_nritems(mid) != 1)
1914 return 0;
1915
1916 /* promote the child to a root */
1917 child = read_node_slot(fs_info, mid, 0);
1918 if (IS_ERR(child)) {
1919 ret = PTR_ERR(child);
1920 btrfs_handle_fs_error(fs_info, ret, NULL);
1921 goto enospc;
1922 }
1923
1924 btrfs_tree_lock(child);
1925 btrfs_set_lock_blocking(child);
1926 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1927 if (ret) {
1928 btrfs_tree_unlock(child);
1929 free_extent_buffer(child);
1930 goto enospc;
1931 }
1932
1933 ret = tree_mod_log_insert_root(root->node, child, 1);
1934 BUG_ON(ret < 0);
1935 rcu_assign_pointer(root->node, child);
1936
1937 add_root_to_dirty_list(root);
1938 btrfs_tree_unlock(child);
1939
1940 path->locks[level] = 0;
1941 path->nodes[level] = NULL;
1942 clean_tree_block(fs_info, mid);
1943 btrfs_tree_unlock(mid);
1944 /* once for the path */
1945 free_extent_buffer(mid);
1946
1947 root_sub_used(root, mid->len);
1948 btrfs_free_tree_block(trans, root, mid, 0, 1);
1949 /* once for the root ptr */
1950 free_extent_buffer_stale(mid);
1951 return 0;
1952 }
1953 if (btrfs_header_nritems(mid) >
1954 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1955 return 0;
1956
1957 left = read_node_slot(fs_info, parent, pslot - 1);
1958 if (IS_ERR(left))
1959 left = NULL;
1960
1961 if (left) {
1962 btrfs_tree_lock(left);
1963 btrfs_set_lock_blocking(left);
1964 wret = btrfs_cow_block(trans, root, left,
1965 parent, pslot - 1, &left);
1966 if (wret) {
1967 ret = wret;
1968 goto enospc;
1969 }
1970 }
1971
1972 right = read_node_slot(fs_info, parent, pslot + 1);
1973 if (IS_ERR(right))
1974 right = NULL;
1975
1976 if (right) {
1977 btrfs_tree_lock(right);
1978 btrfs_set_lock_blocking(right);
1979 wret = btrfs_cow_block(trans, root, right,
1980 parent, pslot + 1, &right);
1981 if (wret) {
1982 ret = wret;
1983 goto enospc;
1984 }
1985 }
1986
1987 /* first, try to make some room in the middle buffer */
1988 if (left) {
1989 orig_slot += btrfs_header_nritems(left);
1990 wret = push_node_left(trans, fs_info, left, mid, 1);
1991 if (wret < 0)
1992 ret = wret;
1993 }
1994
1995 /*
1996 * then try to empty the right most buffer into the middle
1997 */
1998 if (right) {
1999 wret = push_node_left(trans, fs_info, mid, right, 1);
2000 if (wret < 0 && wret != -ENOSPC)
2001 ret = wret;
2002 if (btrfs_header_nritems(right) == 0) {
2003 clean_tree_block(fs_info, right);
2004 btrfs_tree_unlock(right);
2005 del_ptr(root, path, level + 1, pslot + 1);
2006 root_sub_used(root, right->len);
2007 btrfs_free_tree_block(trans, root, right, 0, 1);
2008 free_extent_buffer_stale(right);
2009 right = NULL;
2010 } else {
2011 struct btrfs_disk_key right_key;
2012 btrfs_node_key(right, &right_key, 0);
2013 ret = tree_mod_log_insert_key(parent, pslot + 1,
2014 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2015 BUG_ON(ret < 0);
2016 btrfs_set_node_key(parent, &right_key, pslot + 1);
2017 btrfs_mark_buffer_dirty(parent);
2018 }
2019 }
2020 if (btrfs_header_nritems(mid) == 1) {
2021 /*
2022 * we're not allowed to leave a node with one item in the
2023 * tree during a delete. A deletion from lower in the tree
2024 * could try to delete the only pointer in this node.
2025 * So, pull some keys from the left.
2026 * There has to be a left pointer at this point because
2027 * otherwise we would have pulled some pointers from the
2028 * right
2029 */
2030 if (!left) {
2031 ret = -EROFS;
2032 btrfs_handle_fs_error(fs_info, ret, NULL);
2033 goto enospc;
2034 }
2035 wret = balance_node_right(trans, fs_info, mid, left);
2036 if (wret < 0) {
2037 ret = wret;
2038 goto enospc;
2039 }
2040 if (wret == 1) {
2041 wret = push_node_left(trans, fs_info, left, mid, 1);
2042 if (wret < 0)
2043 ret = wret;
2044 }
2045 BUG_ON(wret == 1);
2046 }
2047 if (btrfs_header_nritems(mid) == 0) {
2048 clean_tree_block(fs_info, mid);
2049 btrfs_tree_unlock(mid);
2050 del_ptr(root, path, level + 1, pslot);
2051 root_sub_used(root, mid->len);
2052 btrfs_free_tree_block(trans, root, mid, 0, 1);
2053 free_extent_buffer_stale(mid);
2054 mid = NULL;
2055 } else {
2056 /* update the parent key to reflect our changes */
2057 struct btrfs_disk_key mid_key;
2058 btrfs_node_key(mid, &mid_key, 0);
2059 ret = tree_mod_log_insert_key(parent, pslot,
2060 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2061 BUG_ON(ret < 0);
2062 btrfs_set_node_key(parent, &mid_key, pslot);
2063 btrfs_mark_buffer_dirty(parent);
2064 }
2065
2066 /* update the path */
2067 if (left) {
2068 if (btrfs_header_nritems(left) > orig_slot) {
2069 extent_buffer_get(left);
2070 /* left was locked after cow */
2071 path->nodes[level] = left;
2072 path->slots[level + 1] -= 1;
2073 path->slots[level] = orig_slot;
2074 if (mid) {
2075 btrfs_tree_unlock(mid);
2076 free_extent_buffer(mid);
2077 }
2078 } else {
2079 orig_slot -= btrfs_header_nritems(left);
2080 path->slots[level] = orig_slot;
2081 }
2082 }
2083 /* double check we haven't messed things up */
2084 if (orig_ptr !=
2085 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2086 BUG();
2087 enospc:
2088 if (right) {
2089 btrfs_tree_unlock(right);
2090 free_extent_buffer(right);
2091 }
2092 if (left) {
2093 if (path->nodes[level] != left)
2094 btrfs_tree_unlock(left);
2095 free_extent_buffer(left);
2096 }
2097 return ret;
2098 }
2099
2100 /* Node balancing for insertion. Here we only split or push nodes around
2101 * when they are completely full. This is also done top down, so we
2102 * have to be pessimistic.
2103 */
2104 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_path *path, int level)
2107 {
2108 struct btrfs_fs_info *fs_info = root->fs_info;
2109 struct extent_buffer *right = NULL;
2110 struct extent_buffer *mid;
2111 struct extent_buffer *left = NULL;
2112 struct extent_buffer *parent = NULL;
2113 int ret = 0;
2114 int wret;
2115 int pslot;
2116 int orig_slot = path->slots[level];
2117
2118 if (level == 0)
2119 return 1;
2120
2121 mid = path->nodes[level];
2122 WARN_ON(btrfs_header_generation(mid) != trans->transid);
2123
2124 if (level < BTRFS_MAX_LEVEL - 1) {
2125 parent = path->nodes[level + 1];
2126 pslot = path->slots[level + 1];
2127 }
2128
2129 if (!parent)
2130 return 1;
2131
2132 left = read_node_slot(fs_info, parent, pslot - 1);
2133 if (IS_ERR(left))
2134 left = NULL;
2135
2136 /* first, try to make some room in the middle buffer */
2137 if (left) {
2138 u32 left_nr;
2139
2140 btrfs_tree_lock(left);
2141 btrfs_set_lock_blocking(left);
2142
2143 left_nr = btrfs_header_nritems(left);
2144 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2145 wret = 1;
2146 } else {
2147 ret = btrfs_cow_block(trans, root, left, parent,
2148 pslot - 1, &left);
2149 if (ret)
2150 wret = 1;
2151 else {
2152 wret = push_node_left(trans, fs_info,
2153 left, mid, 0);
2154 }
2155 }
2156 if (wret < 0)
2157 ret = wret;
2158 if (wret == 0) {
2159 struct btrfs_disk_key disk_key;
2160 orig_slot += left_nr;
2161 btrfs_node_key(mid, &disk_key, 0);
2162 ret = tree_mod_log_insert_key(parent, pslot,
2163 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2164 BUG_ON(ret < 0);
2165 btrfs_set_node_key(parent, &disk_key, pslot);
2166 btrfs_mark_buffer_dirty(parent);
2167 if (btrfs_header_nritems(left) > orig_slot) {
2168 path->nodes[level] = left;
2169 path->slots[level + 1] -= 1;
2170 path->slots[level] = orig_slot;
2171 btrfs_tree_unlock(mid);
2172 free_extent_buffer(mid);
2173 } else {
2174 orig_slot -=
2175 btrfs_header_nritems(left);
2176 path->slots[level] = orig_slot;
2177 btrfs_tree_unlock(left);
2178 free_extent_buffer(left);
2179 }
2180 return 0;
2181 }
2182 btrfs_tree_unlock(left);
2183 free_extent_buffer(left);
2184 }
2185 right = read_node_slot(fs_info, parent, pslot + 1);
2186 if (IS_ERR(right))
2187 right = NULL;
2188
2189 /*
2190 * then try to empty the right most buffer into the middle
2191 */
2192 if (right) {
2193 u32 right_nr;
2194
2195 btrfs_tree_lock(right);
2196 btrfs_set_lock_blocking(right);
2197
2198 right_nr = btrfs_header_nritems(right);
2199 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2200 wret = 1;
2201 } else {
2202 ret = btrfs_cow_block(trans, root, right,
2203 parent, pslot + 1,
2204 &right);
2205 if (ret)
2206 wret = 1;
2207 else {
2208 wret = balance_node_right(trans, fs_info,
2209 right, mid);
2210 }
2211 }
2212 if (wret < 0)
2213 ret = wret;
2214 if (wret == 0) {
2215 struct btrfs_disk_key disk_key;
2216
2217 btrfs_node_key(right, &disk_key, 0);
2218 ret = tree_mod_log_insert_key(parent, pslot + 1,
2219 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2220 BUG_ON(ret < 0);
2221 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2222 btrfs_mark_buffer_dirty(parent);
2223
2224 if (btrfs_header_nritems(mid) <= orig_slot) {
2225 path->nodes[level] = right;
2226 path->slots[level + 1] += 1;
2227 path->slots[level] = orig_slot -
2228 btrfs_header_nritems(mid);
2229 btrfs_tree_unlock(mid);
2230 free_extent_buffer(mid);
2231 } else {
2232 btrfs_tree_unlock(right);
2233 free_extent_buffer(right);
2234 }
2235 return 0;
2236 }
2237 btrfs_tree_unlock(right);
2238 free_extent_buffer(right);
2239 }
2240 return 1;
2241 }
2242
2243 /*
2244 * readahead one full node of leaves, finding things that are close
2245 * to the block in 'slot', and triggering ra on them.
2246 */
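/*
 * Worked example (illustrative): with 16K nodes, only blocks whose bytenr
 * lies within 65536 bytes of the target block are read ahead, and the scan
 * gives up once 64K worth of readahead has been issued or 32 slots have
 * been examined, whichever comes first.
 */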
2247 static void reada_for_search(struct btrfs_fs_info *fs_info,
2248 struct btrfs_path *path,
2249 int level, int slot, u64 objectid)
2250 {
2251 struct extent_buffer *node;
2252 struct btrfs_disk_key disk_key;
2253 u32 nritems;
2254 u64 search;
2255 u64 target;
2256 u64 nread = 0;
2257 struct extent_buffer *eb;
2258 u32 nr;
2259 u32 blocksize;
2260 u32 nscan = 0;
2261
2262 if (level != 1)
2263 return;
2264
2265 if (!path->nodes[level])
2266 return;
2267
2268 node = path->nodes[level];
2269
2270 search = btrfs_node_blockptr(node, slot);
2271 blocksize = fs_info->nodesize;
2272 eb = find_extent_buffer(fs_info, search);
2273 if (eb) {
2274 free_extent_buffer(eb);
2275 return;
2276 }
2277
2278 target = search;
2279
2280 nritems = btrfs_header_nritems(node);
2281 nr = slot;
2282
2283 while (1) {
2284 if (path->reada == READA_BACK) {
2285 if (nr == 0)
2286 break;
2287 nr--;
2288 } else if (path->reada == READA_FORWARD) {
2289 nr++;
2290 if (nr >= nritems)
2291 break;
2292 }
2293 if (path->reada == READA_BACK && objectid) {
2294 btrfs_node_key(node, &disk_key, nr);
2295 if (btrfs_disk_key_objectid(&disk_key) != objectid)
2296 break;
2297 }
2298 search = btrfs_node_blockptr(node, nr);
2299 if ((search <= target && target - search <= 65536) ||
2300 (search > target && search - target <= 65536)) {
2301 readahead_tree_block(fs_info, search);
2302 nread += blocksize;
2303 }
2304 nscan++;
2305 if (nread > 65536 || nscan > 32)
2306 break;
2307 }
2308 }
2309
2310 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2311 struct btrfs_path *path, int level)
2312 {
2313 int slot;
2314 int nritems;
2315 struct extent_buffer *parent;
2316 struct extent_buffer *eb;
2317 u64 gen;
2318 u64 block1 = 0;
2319 u64 block2 = 0;
2320
2321 parent = path->nodes[level + 1];
2322 if (!parent)
2323 return;
2324
2325 nritems = btrfs_header_nritems(parent);
2326 slot = path->slots[level + 1];
2327
2328 if (slot > 0) {
2329 block1 = btrfs_node_blockptr(parent, slot - 1);
2330 gen = btrfs_node_ptr_generation(parent, slot - 1);
2331 eb = find_extent_buffer(fs_info, block1);
2332 /*
2333 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2334 * don't want to return EAGAIN here. That will loop
2335 * forever
2336 */
2337 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2338 block1 = 0;
2339 free_extent_buffer(eb);
2340 }
2341 if (slot + 1 < nritems) {
2342 block2 = btrfs_node_blockptr(parent, slot + 1);
2343 gen = btrfs_node_ptr_generation(parent, slot + 1);
2344 eb = find_extent_buffer(fs_info, block2);
2345 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2346 block2 = 0;
2347 free_extent_buffer(eb);
2348 }
2349
2350 if (block1)
2351 readahead_tree_block(fs_info, block1);
2352 if (block2)
2353 readahead_tree_block(fs_info, block2);
2354 }
2355
2356
2357 /*
2358 * when we walk down the tree, it is usually safe to unlock the higher layers
2359 * in the tree. The exceptions are when our path goes through slot 0, because
2360 * operations on the tree might require changing key pointers higher up in the
2361 * tree.
2362 *
2363 * callers might also have set path->keep_locks, which tells this code to keep
2364 * the lock if the path points to the last slot in the block. This is part of
2365 * walking through the tree, and selecting the next slot in the higher block.
2366 *
2367 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
2368 * if lowest_unlock is 1, level 0 won't be unlocked
2369 */
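/*
 * Worked example (illustrative): called with level == 0, lowest_unlock == 1
 * and path slots [2, 0, 5, ...]. Slot 0 at level 1 means a change to the
 * leaf's first key must ripple into the keys at levels 1 and 2, so those
 * locks are kept, while levels 3 and above can be unlocked.
 */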
2370 static noinline void unlock_up(struct btrfs_path *path, int level,
2371 int lowest_unlock, int min_write_lock_level,
2372 int *write_lock_level)
2373 {
2374 int i;
2375 int skip_level = level;
2376 int no_skips = 0;
2377 struct extent_buffer *t;
2378
2379 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2380 if (!path->nodes[i])
2381 break;
2382 if (!path->locks[i])
2383 break;
2384 if (!no_skips && path->slots[i] == 0) {
2385 skip_level = i + 1;
2386 continue;
2387 }
2388 if (!no_skips && path->keep_locks) {
2389 u32 nritems;
2390 t = path->nodes[i];
2391 nritems = btrfs_header_nritems(t);
2392 if (nritems < 1 || path->slots[i] >= nritems - 1) {
2393 skip_level = i + 1;
2394 continue;
2395 }
2396 }
2397 if (skip_level < i && i >= lowest_unlock)
2398 no_skips = 1;
2399
2400 t = path->nodes[i];
2401 if (i >= lowest_unlock && i > skip_level) {
2402 btrfs_tree_unlock_rw(t, path->locks[i]);
2403 path->locks[i] = 0;
2404 if (write_lock_level &&
2405 i > min_write_lock_level &&
2406 i <= *write_lock_level) {
2407 *write_lock_level = i - 1;
2408 }
2409 }
2410 }
2411 }
2412
2413 /*
2414 * This releases any locks held in the path starting at level and
2415 * going all the way up to the root.
2416 *
2417 * btrfs_search_slot will keep the lock held on higher nodes in a few
2418 * corner cases, such as COW of the block at slot zero in the node. This
2419 * ignores those rules, and it should only be called when there are no
2420 * more updates to be done higher up in the tree.
2421 */
2422 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2423 {
2424 int i;
2425
2426 if (path->keep_locks)
2427 return;
2428
2429 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2430 if (!path->nodes[i])
2431 continue;
2432 if (!path->locks[i])
2433 continue;
2434 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2435 path->locks[i] = 0;
2436 }
2437 }
2438
2439 /*
2440 * helper function for btrfs_search_slot. The goal is to find a block
2441 * in cache without setting the path to blocking. If we find the block
2442 * we return zero and the path is unchanged.
2443 *
2444 * If we can't find the block, we set the path blocking and do some
2445 * reada. -EAGAIN is returned and the search must be repeated.
2446 */
2447 static int
2448 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2449 struct extent_buffer **eb_ret, int level, int slot,
2450 const struct btrfs_key *key)
2451 {
2452 struct btrfs_fs_info *fs_info = root->fs_info;
2453 u64 blocknr;
2454 u64 gen;
2455 struct extent_buffer *b = *eb_ret;
2456 struct extent_buffer *tmp;
2457 struct btrfs_key first_key;
2458 int ret;
2459 int parent_level;
2460
2461 blocknr = btrfs_node_blockptr(b, slot);
2462 gen = btrfs_node_ptr_generation(b, slot);
2463 parent_level = btrfs_header_level(b);
2464 btrfs_node_key_to_cpu(b, &first_key, slot);
2465
2466 tmp = find_extent_buffer(fs_info, blocknr);
2467 if (tmp) {
2468 /* first we do an atomic uptodate check */
2469 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2470 /*
2471 * Do extra check for first_key, eb can be stale due to
2472 * being cached, read from scrub, or have multiple
2473 * parents (shared tree blocks).
2474 */
2475 if (btrfs_verify_level_key(fs_info, tmp,
2476 parent_level - 1, &first_key, gen)) {
2477 free_extent_buffer(tmp);
2478 return -EUCLEAN;
2479 }
2480 *eb_ret = tmp;
2481 return 0;
2482 }
2483
2484 /* the pages were up to date, but we failed
2485 * the generation number check. Do a full
2486 * read for the generation number that is correct.
2487 * We must do this without dropping locks so
2488 * we can trust our generation number
2489 */
2490 btrfs_set_path_blocking(p);
2491
2492 /* now we're allowed to do a blocking uptodate check */
2493 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2494 if (!ret) {
2495 *eb_ret = tmp;
2496 return 0;
2497 }
2498 free_extent_buffer(tmp);
2499 btrfs_release_path(p);
2500 return -EIO;
2501 }
2502
2503 /*
2504 * reduce lock contention at high levels
2505 * of the btree by dropping locks before
2506 * we read. Don't release the lock on the current
2507 * level because we need to walk this node to figure
2508 * out which blocks to read.
2509 */
2510 btrfs_unlock_up_safe(p, level + 1);
2511 btrfs_set_path_blocking(p);
2512
2513 if (p->reada != READA_NONE)
2514 reada_for_search(fs_info, p, level, slot, key->objectid);
2515
2516 ret = -EAGAIN;
2517 tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2518 &first_key);
2519 if (!IS_ERR(tmp)) {
2520 /*
2521 * If the read above didn't mark this buffer up to date,
2522 * it will never end up being up to date. Set ret to EIO now
2523 * and give up so that our caller doesn't loop forever
2524 * on our EAGAINs.
2525 */
2526 if (!extent_buffer_uptodate(tmp))
2527 ret = -EIO;
2528 free_extent_buffer(tmp);
2529 } else {
2530 ret = PTR_ERR(tmp);
2531 }
2532
2533 btrfs_release_path(p);
2534 return ret;
2535 }
2536
2537 /*
2538 * helper function for btrfs_search_slot. This does all of the checks
2539 * for node-level blocks and does any balancing required based on
2540 * the ins_len.
2541 *
2542 * If no extra work was required, zero is returned. If we had to
2543 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2544 * start over
2545 */
2546 static int
2547 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2548 struct btrfs_root *root, struct btrfs_path *p,
2549 struct extent_buffer *b, int level, int ins_len,
2550 int *write_lock_level)
2551 {
2552 struct btrfs_fs_info *fs_info = root->fs_info;
2553 int ret;
2554
2555 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2556 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2557 int sret;
2558
2559 if (*write_lock_level < level + 1) {
2560 *write_lock_level = level + 1;
2561 btrfs_release_path(p);
2562 goto again;
2563 }
2564
2565 btrfs_set_path_blocking(p);
2566 reada_for_balance(fs_info, p, level);
2567 sret = split_node(trans, root, p, level);
2568 btrfs_clear_path_blocking(p, NULL, 0);
2569
2570 BUG_ON(sret > 0);
2571 if (sret) {
2572 ret = sret;
2573 goto done;
2574 }
2575 b = p->nodes[level];
2576 } else if (ins_len < 0 && btrfs_header_nritems(b) <
2577 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2578 int sret;
2579
2580 if (*write_lock_level < level + 1) {
2581 *write_lock_level = level + 1;
2582 btrfs_release_path(p);
2583 goto again;
2584 }
2585
2586 btrfs_set_path_blocking(p);
2587 reada_for_balance(fs_info, p, level);
2588 sret = balance_level(trans, root, p, level);
2589 btrfs_clear_path_blocking(p, NULL, 0);
2590
2591 if (sret) {
2592 ret = sret;
2593 goto done;
2594 }
2595 b = p->nodes[level];
2596 if (!b) {
2597 btrfs_release_path(p);
2598 goto again;
2599 }
2600 BUG_ON(btrfs_header_nritems(b) == 1);
2601 }
2602 return 0;
2603
2604 again:
2605 ret = -EAGAIN;
2606 done:
2607 return ret;
2608 }
2609
2610 static void key_search_validate(struct extent_buffer *b,
2611 const struct btrfs_key *key,
2612 int level)
2613 {
2614 #ifdef CONFIG_BTRFS_ASSERT
2615 struct btrfs_disk_key disk_key;
2616
2617 btrfs_cpu_key_to_disk(&disk_key, key);
2618
2619 if (level == 0)
2620 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2621 offsetof(struct btrfs_leaf, items[0].key),
2622 sizeof(disk_key)));
2623 else
2624 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2625 offsetof(struct btrfs_node, ptrs[0].key),
2626 sizeof(disk_key)));
2627 #endif
2628 }
2629
2630 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2631 int level, int *prev_cmp, int *slot)
2632 {
2633 if (*prev_cmp != 0) {
2634 *prev_cmp = btrfs_bin_search(b, key, level, slot);
2635 return *prev_cmp;
2636 }
2637
2638 key_search_validate(b, key, level);
2639 *slot = 0;
2640
2641 return 0;
2642 }
2643
2644 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2645 u64 iobjectid, u64 ioff, u8 key_type,
2646 struct btrfs_key *found_key)
2647 {
2648 int ret;
2649 struct btrfs_key key;
2650 struct extent_buffer *eb;
2651
2652 ASSERT(path);
2653 ASSERT(found_key);
2654
2655 key.type = key_type;
2656 key.objectid = iobjectid;
2657 key.offset = ioff;
2658
2659 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2660 if (ret < 0)
2661 return ret;
2662
2663 eb = path->nodes[0];
2664 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2665 ret = btrfs_next_leaf(fs_root, path);
2666 if (ret)
2667 return ret;
2668 eb = path->nodes[0];
2669 }
2670
2671 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2672 if (found_key->type != key.type ||
2673 found_key->objectid != key.objectid)
2674 return 1;
2675
2676 return 0;
2677 }
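/*
 * Illustrative sketch (not part of the original file): looking up the
 * INODE_ITEM of the first regular objectid with btrfs_find_item(). The key
 * type constants are real; the surrounding variables are hypothetical.
 */
#if 0	/* example only */
	{
		struct btrfs_key found_key;
		int ret;

		ret = btrfs_find_item(fs_root, path, BTRFS_FIRST_FREE_OBJECTID,
				      0, BTRFS_INODE_ITEM_KEY, &found_key);
		if (ret == 0) {
			/* item is at path->nodes[0], path->slots[0] */
		}
		btrfs_release_path(path);
	}
#endif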
2678
2679 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2680 struct btrfs_path *p,
2681 int write_lock_level)
2682 {
2683 struct btrfs_fs_info *fs_info = root->fs_info;
2684 struct extent_buffer *b;
2685 int root_lock;
2686 int level = 0;
2687
2688 /* We try very hard to do read locks on the root */
2689 root_lock = BTRFS_READ_LOCK;
2690
2691 if (p->search_commit_root) {
2692 /*
2693 * The commit roots are read only so we always do read locks,
2694 * and we always must hold the commit_root_sem when doing
2695 * searches on them, the only exception is send where we don't
2696 * want to block transaction commits for a long time, so
2697 * we need to clone the commit root in order to avoid races
2698 * with transaction commits that create a snapshot of one of
2699 * the roots used by a send operation.
2700 */
2701 if (p->need_commit_sem) {
2702 down_read(&fs_info->commit_root_sem);
2703 b = btrfs_clone_extent_buffer(root->commit_root);
2704 up_read(&fs_info->commit_root_sem);
2705 if (!b)
2706 return ERR_PTR(-ENOMEM);
2707
2708 } else {
2709 b = root->commit_root;
2710 extent_buffer_get(b);
2711 }
2712 level = btrfs_header_level(b);
2713 /*
2714 * Ensure that all callers have set skip_locking when
2715 * p->search_commit_root = 1.
2716 */
2717 ASSERT(p->skip_locking == 1);
2718
2719 goto out;
2720 }
2721
2722 if (p->skip_locking) {
2723 b = btrfs_root_node(root);
2724 level = btrfs_header_level(b);
2725 goto out;
2726 }
2727
2728 /*
2729 * If the write lock level is set to the maximum, we can skip trying to
2730 * get the read lock.
2731 */
2732 if (write_lock_level < BTRFS_MAX_LEVEL) {
2733 /*
2734 * We don't know the level of the root node until we actually
2735 * have it read locked
2736 */
2737 b = btrfs_read_lock_root_node(root);
2738 level = btrfs_header_level(b);
2739 if (level > write_lock_level)
2740 goto out;
2741
2742 /* Whoops, must trade for write lock */
2743 btrfs_tree_read_unlock(b);
2744 free_extent_buffer(b);
2745 }
2746
2747 b = btrfs_lock_root_node(root);
2748 root_lock = BTRFS_WRITE_LOCK;
2749
2750 /* The level might have changed, check again */
2751 level = btrfs_header_level(b);
2752
2753 out:
2754 p->nodes[level] = b;
2755 if (!p->skip_locking)
2756 p->locks[level] = root_lock;
2757 /*
2758 * Callers are responsible for dropping b's references.
2759 */
2760 return b;
2761 }
2762
2763
2764 /*
2765 * btrfs_search_slot - look for a key in a tree and perform necessary
2766 * modifications to preserve tree invariants.
2767 *
2768 * @trans: Handle of transaction, used when modifying the tree
2769 * @root: The root node of the tree
2770 * @key: The key we are looking for
2771 * @p: Holds all btree nodes along the search path
2772 * @ins_len: Indicates purpose of search, for inserts it is 1, for
2773 * deletions it's -1. 0 for plain searches
2774 * @cow: boolean should CoW operations be performed. Must always be 1
2775 * when modifying the tree.
2776 *
2777 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2778 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2779 *
2780 * If @key is found, 0 is returned and you can find the item in the leaf level
2781 * of the path (level 0)
2782 *
2783 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2784 * points to the slot where it should be inserted
2785 *
2786 * If an error is encountered while searching the tree a negative error number
2787 * is returned
2788 */
2789 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2790 const struct btrfs_key *key, struct btrfs_path *p,
2791 int ins_len, int cow)
2792 {
2793 struct btrfs_fs_info *fs_info = root->fs_info;
2794 struct extent_buffer *b;
2795 int slot;
2796 int ret;
2797 int err;
2798 int level;
2799 int lowest_unlock = 1;
2800 /* everything at write_lock_level or lower must be write locked */
2801 int write_lock_level = 0;
2802 u8 lowest_level = 0;
2803 int min_write_lock_level;
2804 int prev_cmp;
2805
2806 lowest_level = p->lowest_level;
2807 WARN_ON(lowest_level && ins_len > 0);
2808 WARN_ON(p->nodes[0] != NULL);
2809 BUG_ON(!cow && ins_len);
2810
2811 if (ins_len < 0) {
2812 lowest_unlock = 2;
2813
2814 /* when we are removing items, we might have to go up to level
2815 * two as we update tree pointers. Make sure we keep write
2816 * locks on those levels as well
2817 */
2818 write_lock_level = 2;
2819 } else if (ins_len > 0) {
2820 /*
2821 * for inserting items, make sure we have a write lock on
2822 * level 1 so we can update keys
2823 */
2824 write_lock_level = 1;
2825 }
2826
2827 if (!cow)
2828 write_lock_level = -1;
2829
2830 if (cow && (p->keep_locks || p->lowest_level))
2831 write_lock_level = BTRFS_MAX_LEVEL;
2832
2833 min_write_lock_level = write_lock_level;
2834
2835 again:
2836 prev_cmp = -1;
2837 b = btrfs_search_slot_get_root(root, p, write_lock_level);
2838 if (IS_ERR(b)) {
2839 ret = PTR_ERR(b);
2840 goto done;
2841 }
2842
2843 while (b) {
2844 level = btrfs_header_level(b);
2845
2846 /*
2847 * setup the path here so we can release it under lock
2848 * contention with the cow code
2849 */
2850 if (cow) {
2851 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2852
2853 /*
2854 * if we don't really need to cow this block
2855 * then we don't want to set the path blocking,
2856 * so we test it here
2857 */
2858 if (!should_cow_block(trans, root, b)) {
2859 trans->dirty = true;
2860 goto cow_done;
2861 }
2862
2863 /*
2864 * must have write locks on this node and the
2865 * parent
2866 */
2867 if (level > write_lock_level ||
2868 (level + 1 > write_lock_level &&
2869 level + 1 < BTRFS_MAX_LEVEL &&
2870 p->nodes[level + 1])) {
2871 write_lock_level = level + 1;
2872 btrfs_release_path(p);
2873 goto again;
2874 }
2875
2876 btrfs_set_path_blocking(p);
2877 if (last_level)
2878 err = btrfs_cow_block(trans, root, b, NULL, 0,
2879 &b);
2880 else
2881 err = btrfs_cow_block(trans, root, b,
2882 p->nodes[level + 1],
2883 p->slots[level + 1], &b);
2884 if (err) {
2885 ret = err;
2886 goto done;
2887 }
2888 }
2889 cow_done:
2890 p->nodes[level] = b;
2891 btrfs_clear_path_blocking(p, NULL, 0);
2892
2893 /*
2894 * we have a lock on b and as long as we aren't changing
2895 * the tree, there is no way for the items in b to change.
2896 * It is safe to drop the lock on our parent before we
2897 * go through the expensive btree search on b.
2898 *
2899 * If we're inserting or deleting (ins_len != 0), then we might
2900 * be changing slot zero, which may require changing the parent.
2901 * So, we can't drop the lock until after we know which slot
2902 * we're operating on.
2903 */
2904 if (!ins_len && !p->keep_locks) {
2905 int u = level + 1;
2906
2907 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2908 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2909 p->locks[u] = 0;
2910 }
2911 }
2912
2913 ret = key_search(b, key, level, &prev_cmp, &slot);
2914 if (ret < 0)
2915 goto done;
2916
2917 if (level != 0) {
2918 int dec = 0;
2919 if (ret && slot > 0) {
2920 dec = 1;
2921 slot -= 1;
2922 }
2923 p->slots[level] = slot;
2924 err = setup_nodes_for_search(trans, root, p, b, level,
2925 ins_len, &write_lock_level);
2926 if (err == -EAGAIN)
2927 goto again;
2928 if (err) {
2929 ret = err;
2930 goto done;
2931 }
2932 b = p->nodes[level];
2933 slot = p->slots[level];
2934
2935 /*
2936 * slot 0 is special, if we change the key
2937 * we have to update the parent pointer
2938 * which means we must have a write lock
2939 * on the parent
2940 */
2941 if (slot == 0 && ins_len &&
2942 write_lock_level < level + 1) {
2943 write_lock_level = level + 1;
2944 btrfs_release_path(p);
2945 goto again;
2946 }
2947
2948 unlock_up(p, level, lowest_unlock,
2949 min_write_lock_level, &write_lock_level);
2950
2951 if (level == lowest_level) {
2952 if (dec)
2953 p->slots[level]++;
2954 goto done;
2955 }
2956
2957 err = read_block_for_search(root, p, &b, level,
2958 slot, key);
2959 if (err == -EAGAIN)
2960 goto again;
2961 if (err) {
2962 ret = err;
2963 goto done;
2964 }
2965
2966 if (!p->skip_locking) {
2967 level = btrfs_header_level(b);
2968 if (level <= write_lock_level) {
2969 err = btrfs_try_tree_write_lock(b);
2970 if (!err) {
2971 btrfs_set_path_blocking(p);
2972 btrfs_tree_lock(b);
2973 btrfs_clear_path_blocking(p, b,
2974 BTRFS_WRITE_LOCK);
2975 }
2976 p->locks[level] = BTRFS_WRITE_LOCK;
2977 } else {
2978 err = btrfs_tree_read_lock_atomic(b);
2979 if (!err) {
2980 btrfs_set_path_blocking(p);
2981 btrfs_tree_read_lock(b);
2982 btrfs_clear_path_blocking(p, b,
2983 BTRFS_READ_LOCK);
2984 }
2985 p->locks[level] = BTRFS_READ_LOCK;
2986 }
2987 p->nodes[level] = b;
2988 }
2989 } else {
2990 p->slots[level] = slot;
2991 if (ins_len > 0 &&
2992 btrfs_leaf_free_space(fs_info, b) < ins_len) {
2993 if (write_lock_level < 1) {
2994 write_lock_level = 1;
2995 btrfs_release_path(p);
2996 goto again;
2997 }
2998
2999 btrfs_set_path_blocking(p);
3000 err = split_leaf(trans, root, key,
3001 p, ins_len, ret == 0);
3002 btrfs_clear_path_blocking(p, NULL, 0);
3003
3004 BUG_ON(err > 0);
3005 if (err) {
3006 ret = err;
3007 goto done;
3008 }
3009 }
3010 if (!p->search_for_split)
3011 unlock_up(p, level, lowest_unlock,
3012 min_write_lock_level, &write_lock_level);
3013 goto done;
3014 }
3015 }
3016 ret = 1;
3017 done:
3018 /*
3019 * we don't really know what they plan on doing with the path
3020 * from here on, so for now just mark it as blocking
3021 */
3022 if (!p->leave_spinning)
3023 btrfs_set_path_blocking(p);
3024 if (ret < 0 && !p->skip_release_on_error)
3025 btrfs_release_path(p);
3026 return ret;
3027 }
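/*
 * Illustrative sketch (not part of the original file): the common read-only
 * lookup pattern built on btrfs_search_slot(). With a NULL transaction,
 * ins_len == 0 and cow == 0 the tree is left unmodified.
 */
#if 0	/* example only */
static int demo_lookup_inode_item(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret > 0)
		ret = -ENOENT;	/* not found, slot is the insert position */
	/* on ret == 0 the item is at path->nodes[0], path->slots[0] */

	btrfs_free_path(path);
	return ret;
}
#endif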
3028
3029 /*
3030 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
3031 * current state of the tree together with the operations recorded in the tree
3032 * modification log to search for the key in a previous version of this tree, as
3033 * denoted by the time_seq parameter.
3034 *
3035 * Naturally, there is no support for insert, delete or cow operations.
3036 *
3037 * The resulting path and return value will be set up as if we called
3038 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
3039 */
3040 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
3041 struct btrfs_path *p, u64 time_seq)
3042 {
3043 struct btrfs_fs_info *fs_info = root->fs_info;
3044 struct extent_buffer *b;
3045 int slot;
3046 int ret;
3047 int err;
3048 int level;
3049 int lowest_unlock = 1;
3050 u8 lowest_level = 0;
3051 int prev_cmp = -1;
3052
3053 lowest_level = p->lowest_level;
3054 WARN_ON(p->nodes[0] != NULL);
3055
3056 if (p->search_commit_root) {
3057 BUG_ON(time_seq);
3058 return btrfs_search_slot(NULL, root, key, p, 0, 0);
3059 }
3060
3061 again:
3062 b = get_old_root(root, time_seq);
3063 if (!b) {
3064 ret = -EIO;
3065 goto done;
3066 }
3067 level = btrfs_header_level(b);
3068 p->locks[level] = BTRFS_READ_LOCK;
3069
3070 while (b) {
3071 level = btrfs_header_level(b);
3072 p->nodes[level] = b;
3073 btrfs_clear_path_blocking(p, NULL, 0);
3074
3075 /*
3076 * we have a lock on b and as long as we aren't changing
3077 * the tree, there is no way for the items in b to change.
3078 * It is safe to drop the lock on our parent before we
3079 * go through the expensive btree search on b.
3080 */
3081 btrfs_unlock_up_safe(p, level + 1);
3082
3083 /*
3084 * Since we can unwind ebs we want to do a real search every
3085 * time.
3086 */
3087 prev_cmp = -1;
3088 ret = key_search(b, key, level, &prev_cmp, &slot);
3089
3090 if (level != 0) {
3091 int dec = 0;
3092 if (ret && slot > 0) {
3093 dec = 1;
3094 slot -= 1;
3095 }
3096 p->slots[level] = slot;
3097 unlock_up(p, level, lowest_unlock, 0, NULL);
3098
3099 if (level == lowest_level) {
3100 if (dec)
3101 p->slots[level]++;
3102 goto done;
3103 }
3104
3105 err = read_block_for_search(root, p, &b, level,
3106 slot, key);
3107 if (err == -EAGAIN)
3108 goto again;
3109 if (err) {
3110 ret = err;
3111 goto done;
3112 }
3113
3114 level = btrfs_header_level(b);
3115 err = btrfs_tree_read_lock_atomic(b);
3116 if (!err) {
3117 btrfs_set_path_blocking(p);
3118 btrfs_tree_read_lock(b);
3119 btrfs_clear_path_blocking(p, b,
3120 BTRFS_READ_LOCK);
3121 }
3122 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3123 if (!b) {
3124 ret = -ENOMEM;
3125 goto done;
3126 }
3127 p->locks[level] = BTRFS_READ_LOCK;
3128 p->nodes[level] = b;
3129 } else {
3130 p->slots[level] = slot;
3131 unlock_up(p, level, lowest_unlock, 0, NULL);
3132 goto done;
3133 }
3134 }
3135 ret = 1;
3136 done:
3137 if (!p->leave_spinning)
3138 btrfs_set_path_blocking(p);
3139 if (ret < 0)
3140 btrfs_release_path(p);
3141
3142 return ret;
3143 }
3144
3145 /*
3146 * helper to use instead of btrfs_search_slot if no exact match is needed but
3147 * instead the next or previous item should be returned.
3148 * When find_higher is true, the next higher item is returned, the next lower
3149 * otherwise.
3150 * When return_any and find_higher are both true, and no higher item is found,
3151 * return the next lower instead.
3152 * When return_any is true and find_higher is false, and no lower item is found,
3153 * return the next higher instead.
3154 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3155 * < 0 on error
3156 */
3157 int btrfs_search_slot_for_read(struct btrfs_root *root,
3158 const struct btrfs_key *key,
3159 struct btrfs_path *p, int find_higher,
3160 int return_any)
3161 {
3162 int ret;
3163 struct extent_buffer *leaf;
3164
3165 again:
3166 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3167 if (ret <= 0)
3168 return ret;
3169 /*
3170 * a return value of 1 means the path is at the position where the
3171 * item should be inserted. Normally this is the next bigger item,
3172 * but in case the previous item is the last in a leaf, path points
3173 * to the first free slot in the previous leaf, i.e. at an invalid
3174 * item.
3175 */
3176 leaf = p->nodes[0];
3177
3178 if (find_higher) {
3179 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3180 ret = btrfs_next_leaf(root, p);
3181 if (ret <= 0)
3182 return ret;
3183 if (!return_any)
3184 return 1;
3185 /*
3186 * no higher item found, return the next
3187 * lower instead
3188 */
3189 return_any = 0;
3190 find_higher = 0;
3191 btrfs_release_path(p);
3192 goto again;
3193 }
3194 } else {
3195 if (p->slots[0] == 0) {
3196 ret = btrfs_prev_leaf(root, p);
3197 if (ret < 0)
3198 return ret;
3199 if (!ret) {
3200 leaf = p->nodes[0];
3201 if (p->slots[0] == btrfs_header_nritems(leaf))
3202 p->slots[0]--;
3203 return 0;
3204 }
3205 if (!return_any)
3206 return 1;
3207 /*
3208 * no lower item found, return the next
3209 * higher instead
3210 */
3211 return_any = 0;
3212 find_higher = 1;
3213 btrfs_release_path(p);
3214 goto again;
3215 } else {
3216 --p->slots[0];
3217 }
3218 }
3219 return 0;
3220 }
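/*
 * Illustrative sketch (not part of the original file): finding the item at
 * 'key' or, failing that, the next higher one. With return_any == 0 the
 * call returns 1 when nothing at or above 'key' exists.
 */
#if 0	/* example only */
	{
		struct btrfs_key found;
		int ret;

		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
		if (ret == 0)
			btrfs_item_key_to_cpu(path->nodes[0], &found,
					      path->slots[0]);
	}
#endif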
3221
3222 /*
3223 * adjust the pointers going up the tree, starting at level
3224 * making sure the right key of each node points to 'key'.
3225 * This is used after shifting pointers to the left, so it stops
3226 * fixing up pointers when a given leaf/node is not in slot 0 of the
3227 * higher levels
3228 *
3229 */
3230 static void fixup_low_keys(struct btrfs_path *path,
3231 struct btrfs_disk_key *key, int level)
3232 {
3233 int i;
3234 struct extent_buffer *t;
3235 int ret;
3236
3237 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3238 int tslot = path->slots[i];
3239
3240 if (!path->nodes[i])
3241 break;
3242 t = path->nodes[i];
3243 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3244 GFP_ATOMIC);
3245 BUG_ON(ret < 0);
3246 btrfs_set_node_key(t, key, tslot);
3247 btrfs_mark_buffer_dirty(path->nodes[i]);
3248 if (tslot != 0)
3249 break;
3250 }
3251 }
3252
3253 /*
3254 * update item key.
3255 *
3256 * This function isn't completely safe. It's the caller's responsibility
3257 * to ensure that the new key won't break the sort order
3258 */
3259 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3260 struct btrfs_path *path,
3261 const struct btrfs_key *new_key)
3262 {
3263 struct btrfs_disk_key disk_key;
3264 struct extent_buffer *eb;
3265 int slot;
3266
3267 eb = path->nodes[0];
3268 slot = path->slots[0];
3269 if (slot > 0) {
3270 btrfs_item_key(eb, &disk_key, slot - 1);
3271 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3272 }
3273 if (slot < btrfs_header_nritems(eb) - 1) {
3274 btrfs_item_key(eb, &disk_key, slot + 1);
3275 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3276 }
3277
3278 btrfs_cpu_key_to_disk(&disk_key, new_key);
3279 btrfs_set_item_key(eb, &disk_key, slot);
3280 btrfs_mark_buffer_dirty(eb);
3281 if (slot == 0)
3282 fixup_low_keys(path, &disk_key, 1);
3283 }
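/*
 * Illustrative sketch (not part of the original file): shrinking an item's
 * key offset in place, e.g. after trimming the front of an extent. The
 * variables are hypothetical, and the new key must still sort between the
 * neighbouring items or the BUG_ONs above will fire.
 */
#if 0	/* example only */
	{
		struct btrfs_key new_key = found_key;

		new_key.offset += bytes_trimmed;	/* stays below the next key */
		btrfs_set_item_key_safe(fs_info, path, &new_key);
	}
#endif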
3284
3285 /*
3286 * try to push data from one node into the next node left in the
3287 * tree.
3288 *
3289 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3290 * error, and > 0 if there was no room in the left hand block.
3291 */
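/*
 * Worked example (illustrative, assuming BTRFS_NODEPTRS_PER_BLOCK == 493
 * for 16K nodes): with dst_nritems == 490 and src_nritems == 20 on a
 * non-empty push, the free room in dst allows push_items == 3, which is
 * within the src_nritems - 8 == 12 cap, so three pointers move left and
 * src keeps 17.
 */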
3292 static int push_node_left(struct btrfs_trans_handle *trans,
3293 struct btrfs_fs_info *fs_info,
3294 struct extent_buffer *dst,
3295 struct extent_buffer *src, int empty)
3296 {
3297 int push_items = 0;
3298 int src_nritems;
3299 int dst_nritems;
3300 int ret = 0;
3301
3302 src_nritems = btrfs_header_nritems(src);
3303 dst_nritems = btrfs_header_nritems(dst);
3304 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3305 WARN_ON(btrfs_header_generation(src) != trans->transid);
3306 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3307
3308 if (!empty && src_nritems <= 8)
3309 return 1;
3310
3311 if (push_items <= 0)
3312 return 1;
3313
3314 if (empty) {
3315 push_items = min(src_nritems, push_items);
3316 if (push_items < src_nritems) {
3317 /* leave at least 8 pointers in the node if
3318 * we aren't going to empty it
3319 */
3320 if (src_nritems - push_items < 8) {
3321 if (push_items <= 8)
3322 return 1;
3323 push_items -= 8;
3324 }
3325 }
3326 } else
3327 push_items = min(src_nritems - 8, push_items);
3328
3329 ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3330 push_items);
3331 if (ret) {
3332 btrfs_abort_transaction(trans, ret);
3333 return ret;
3334 }
3335 copy_extent_buffer(dst, src,
3336 btrfs_node_key_ptr_offset(dst_nritems),
3337 btrfs_node_key_ptr_offset(0),
3338 push_items * sizeof(struct btrfs_key_ptr));
3339
3340 if (push_items < src_nritems) {
3341 /*
3342 * Don't call tree_mod_log_insert_move here, key removal was
3343 * already fully logged by tree_mod_log_eb_copy above.
3344 */
3345 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3346 btrfs_node_key_ptr_offset(push_items),
3347 (src_nritems - push_items) *
3348 sizeof(struct btrfs_key_ptr));
3349 }
3350 btrfs_set_header_nritems(src, src_nritems - push_items);
3351 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3352 btrfs_mark_buffer_dirty(src);
3353 btrfs_mark_buffer_dirty(dst);
3354
3355 return ret;
3356 }
3357
3358 /*
3359 * try to push data from one node into the next node right in the
3360 * tree.
3361 *
3362 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3363 * error, and > 0 if there was no room in the right hand block.
3364 *
3365 * this will only push up to 1/2 the contents of the left node over
3366 */
3367 static int balance_node_right(struct btrfs_trans_handle *trans,
3368 struct btrfs_fs_info *fs_info,
3369 struct extent_buffer *dst,
3370 struct extent_buffer *src)
3371 {
3372 int push_items = 0;
3373 int max_push;
3374 int src_nritems;
3375 int dst_nritems;
3376 int ret = 0;
3377
3378 WARN_ON(btrfs_header_generation(src) != trans->transid);
3379 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3380
3381 src_nritems = btrfs_header_nritems(src);
3382 dst_nritems = btrfs_header_nritems(dst);
3383 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3384 if (push_items <= 0)
3385 return 1;
3386
3387 if (src_nritems < 4)
3388 return 1;
3389
3390 max_push = src_nritems / 2 + 1;
3391 /* don't try to empty the node */
3392 if (max_push >= src_nritems)
3393 return 1;
3394
3395 if (max_push < push_items)
3396 push_items = max_push;
3397
3398 ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3399 BUG_ON(ret < 0);
3400 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3401 btrfs_node_key_ptr_offset(0),
3402 (dst_nritems) *
3403 sizeof(struct btrfs_key_ptr));
3404
3405 ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3406 src_nritems - push_items, push_items);
3407 if (ret) {
3408 btrfs_abort_transaction(trans, ret);
3409 return ret;
3410 }
3411 copy_extent_buffer(dst, src,
3412 btrfs_node_key_ptr_offset(0),
3413 btrfs_node_key_ptr_offset(src_nritems - push_items),
3414 push_items * sizeof(struct btrfs_key_ptr));
3415
3416 btrfs_set_header_nritems(src, src_nritems - push_items);
3417 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3418
3419 btrfs_mark_buffer_dirty(src);
3420 btrfs_mark_buffer_dirty(dst);
3421
3422 return ret;
3423 }
3424
3425 /*
3426 * helper function to insert a new root level in the tree.
3427 * A new node is allocated, and a single item is inserted to
3428 * point to the existing root
3429 *
3430 * returns zero on success or < 0 on failure.
3431 */
3432 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3433 struct btrfs_root *root,
3434 struct btrfs_path *path, int level)
3435 {
3436 struct btrfs_fs_info *fs_info = root->fs_info;
3437 u64 lower_gen;
3438 struct extent_buffer *lower;
3439 struct extent_buffer *c;
3440 struct extent_buffer *old;
3441 struct btrfs_disk_key lower_key;
3442 int ret;
3443
3444 BUG_ON(path->nodes[level]);
3445 BUG_ON(path->nodes[level-1] != root->node);
3446
3447 lower = path->nodes[level-1];
3448 if (level == 1)
3449 btrfs_item_key(lower, &lower_key, 0);
3450 else
3451 btrfs_node_key(lower, &lower_key, 0);
3452
3453 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3454 root->node->start, 0);
3455 if (IS_ERR(c))
3456 return PTR_ERR(c);
3457
3458 root_add_used(root, fs_info->nodesize);
3459
3460 btrfs_set_header_nritems(c, 1);
3461 btrfs_set_node_key(c, &lower_key, 0);
3462 btrfs_set_node_blockptr(c, 0, lower->start);
3463 lower_gen = btrfs_header_generation(lower);
3464 WARN_ON(lower_gen != trans->transid);
3465
3466 btrfs_set_node_ptr_generation(c, 0, lower_gen);
3467
3468 btrfs_mark_buffer_dirty(c);
3469
3470 old = root->node;
3471 ret = tree_mod_log_insert_root(root->node, c, 0);
3472 BUG_ON(ret < 0);
3473 rcu_assign_pointer(root->node, c);
3474
3475 /* the super has an extra ref to root->node */
3476 free_extent_buffer(old);
3477
3478 add_root_to_dirty_list(root);
3479 extent_buffer_get(c);
3480 path->nodes[level] = c;
3481 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3482 path->slots[level] = 0;
3483 return 0;
3484 }
3485
3486 /*
3487 * worker function to insert a single pointer in a node.
3488 * the node should have enough room for the pointer already
3489 *
3490 * slot and level indicate where you want the key to go, and
3491 * blocknr is the block the key points to.
3492 */
3493 static void insert_ptr(struct btrfs_trans_handle *trans,
3494 struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3495 struct btrfs_disk_key *key, u64 bytenr,
3496 int slot, int level)
3497 {
3498 struct extent_buffer *lower;
3499 int nritems;
3500 int ret;
3501
3502 BUG_ON(!path->nodes[level]);
3503 btrfs_assert_tree_locked(path->nodes[level]);
3504 lower = path->nodes[level];
3505 nritems = btrfs_header_nritems(lower);
3506 BUG_ON(slot > nritems);
3507 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3508 if (slot != nritems) {
3509 if (level) {
3510 ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3511 nritems - slot);
3512 BUG_ON(ret < 0);
3513 }
3514 memmove_extent_buffer(lower,
3515 btrfs_node_key_ptr_offset(slot + 1),
3516 btrfs_node_key_ptr_offset(slot),
3517 (nritems - slot) * sizeof(struct btrfs_key_ptr));
3518 }
3519 if (level) {
3520 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3521 GFP_NOFS);
3522 BUG_ON(ret < 0);
3523 }
3524 btrfs_set_node_key(lower, key, slot);
3525 btrfs_set_node_blockptr(lower, slot, bytenr);
3526 WARN_ON(trans->transid == 0);
3527 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3528 btrfs_set_header_nritems(lower, nritems + 1);
3529 btrfs_mark_buffer_dirty(lower);
3530 }
3531
3532 /*
3533 * split the node at the specified level in path in two.
3534 * The path is corrected to point to the appropriate node after the split
3535 *
3536 * Before splitting this tries to make some room in the node by pushing
3537 * left and right, if either one works, it returns right away.
3538 *
3539 * returns 0 on success and < 0 on failure
3540 */
3541 static noinline int split_node(struct btrfs_trans_handle *trans,
3542 struct btrfs_root *root,
3543 struct btrfs_path *path, int level)
3544 {
3545 struct btrfs_fs_info *fs_info = root->fs_info;
3546 struct extent_buffer *c;
3547 struct extent_buffer *split;
3548 struct btrfs_disk_key disk_key;
3549 int mid;
3550 int ret;
3551 u32 c_nritems;
3552
3553 c = path->nodes[level];
3554 WARN_ON(btrfs_header_generation(c) != trans->transid);
3555 if (c == root->node) {
3556 /*
3557 * trying to split the root, lets make a new one
3558 *
3559 * tree mod log: we don't log the removal of the old root in
3560 * insert_new_root, because that root buffer will be kept as a
3561 * normal node. We are going to log removal of half of the
3562 * elements below with tree_mod_log_eb_copy. We're holding a
3563 * tree lock on the buffer, which is why we cannot race with
3564 * other tree_mod_log users.
3565 */
3566 ret = insert_new_root(trans, root, path, level + 1);
3567 if (ret)
3568 return ret;
3569 } else {
3570 ret = push_nodes_for_insert(trans, root, path, level);
3571 c = path->nodes[level];
3572 if (!ret && btrfs_header_nritems(c) <
3573 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3574 return 0;
3575 if (ret < 0)
3576 return ret;
3577 }
3578
3579 c_nritems = btrfs_header_nritems(c);
3580 mid = (c_nritems + 1) / 2;
3581 btrfs_node_key(c, &disk_key, mid);
3582
3583 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3584 c->start, 0);
3585 if (IS_ERR(split))
3586 return PTR_ERR(split);
3587
3588 root_add_used(root, fs_info->nodesize);
3589 ASSERT(btrfs_header_level(c) == level);
3590
3591 ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3592 if (ret) {
3593 btrfs_tree_unlock(split);
3594 free_extent_buffer(split);
3595 btrfs_abort_transaction(trans, ret);
3596 return ret;
3597 }
3598 copy_extent_buffer(split, c,
3599 btrfs_node_key_ptr_offset(0),
3600 btrfs_node_key_ptr_offset(mid),
3601 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3602 btrfs_set_header_nritems(split, c_nritems - mid);
3603 btrfs_set_header_nritems(c, mid);
3604 ret = 0;
3605
3606 btrfs_mark_buffer_dirty(c);
3607 btrfs_mark_buffer_dirty(split);
3608
3609 insert_ptr(trans, fs_info, path, &disk_key, split->start,
3610 path->slots[level + 1] + 1, level + 1);
3611
3612 if (path->slots[level] >= mid) {
3613 path->slots[level] -= mid;
3614 btrfs_tree_unlock(c);
3615 free_extent_buffer(c);
3616 path->nodes[level] = split;
3617 path->slots[level + 1] += 1;
3618 } else {
3619 btrfs_tree_unlock(split);
3620 free_extent_buffer(split);
3621 }
3622 return ret;
3623 }
3624
3625 /*
3626 * how many bytes are required to store the items in a leaf. start
3627 * and nr indicate which items in the leaf to check. This totals up the
3628 * space used both by the item structs and the item data
3629 */
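/*
 * Worked example (illustrative, assuming a 16K leaf where
 * BTRFS_LEAF_DATA_SIZE == 16283): ten items whose payloads total 1000
 * bytes consume 10 * sizeof(struct btrfs_item) == 250 bytes of headers
 * growing from the front and 1000 bytes of data growing back from the
 * end, leaving 16283 - 1250 == 15033 bytes free in the middle.
 */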
3630 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3631 {
3632 struct btrfs_item *start_item;
3633 struct btrfs_item *end_item;
3634 struct btrfs_map_token token;
3635 int data_len;
3636 int nritems = btrfs_header_nritems(l);
3637 int end = min(nritems, start + nr) - 1;
3638
3639 if (!nr)
3640 return 0;
3641 btrfs_init_map_token(&token);
3642 start_item = btrfs_item_nr(start);
3643 end_item = btrfs_item_nr(end);
3644 data_len = btrfs_token_item_offset(l, start_item, &token) +
3645 btrfs_token_item_size(l, start_item, &token);
3646 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3647 data_len += sizeof(struct btrfs_item) * nr;
3648 WARN_ON(data_len < 0);
3649 return data_len;
3650 }
3651
3652 /*
3653 * The space between the end of the leaf items and
3654 * the start of the leaf data. IOW, how much room
3655 * the leaf has left for both items and data
3656 */
3657 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3658 struct extent_buffer *leaf)
3659 {
3660 int nritems = btrfs_header_nritems(leaf);
3661 int ret;
3662
3663 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3664 if (ret < 0) {
3665 btrfs_crit(fs_info,
3666 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3667 ret,
3668 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3669 leaf_space_used(leaf, 0, nritems), nritems);
3670 }
3671 return ret;
3672 }
3673
3674 /*
3675 * min slot controls the lowest index we're willing to push to the
3676 * right. We'll push up to and including min_slot, but no lower
3677 */
3678 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3679 struct btrfs_path *path,
3680 int data_size, int empty,
3681 struct extent_buffer *right,
3682 int free_space, u32 left_nritems,
3683 u32 min_slot)
3684 {
3685 struct extent_buffer *left = path->nodes[0];
3686 struct extent_buffer *upper = path->nodes[1];
3687 struct btrfs_map_token token;
3688 struct btrfs_disk_key disk_key;
3689 int slot;
3690 u32 i;
3691 int push_space = 0;
3692 int push_items = 0;
3693 struct btrfs_item *item;
3694 u32 nr;
3695 u32 right_nritems;
3696 u32 data_end;
3697 u32 this_item_size;
3698
3699 btrfs_init_map_token(&token);
3700
3701 if (empty)
3702 nr = 0;
3703 else
3704 nr = max_t(u32, 1, min_slot);
3705
3706 if (path->slots[0] >= left_nritems)
3707 push_space += data_size;
3708
3709 slot = path->slots[1];
3710 i = left_nritems - 1;
3711 while (i >= nr) {
3712 item = btrfs_item_nr(i);
3713
3714 if (!empty && push_items > 0) {
3715 if (path->slots[0] > i)
3716 break;
3717 if (path->slots[0] == i) {
3718 int space = btrfs_leaf_free_space(fs_info, left);
3719 if (space + push_space * 2 > free_space)
3720 break;
3721 }
3722 }
3723
3724 if (path->slots[0] == i)
3725 push_space += data_size;
3726
3727 this_item_size = btrfs_item_size(left, item);
3728 if (this_item_size + sizeof(*item) + push_space > free_space)
3729 break;
3730
3731 push_items++;
3732 push_space += this_item_size + sizeof(*item);
3733 if (i == 0)
3734 break;
3735 i--;
3736 }
3737
3738 if (push_items == 0)
3739 goto out_unlock;
3740
3741 WARN_ON(!empty && push_items == left_nritems);
3742
3743 /* push left to right */
3744 right_nritems = btrfs_header_nritems(right);
3745
3746 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3747 push_space -= leaf_data_end(fs_info, left);
3748
3749 /* make room in the right data area */
3750 data_end = leaf_data_end(fs_info, right);
3751 memmove_extent_buffer(right,
3752 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3753 BTRFS_LEAF_DATA_OFFSET + data_end,
3754 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3755
3756 /* copy from the left data area */
3757 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3758 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3759 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3760 push_space);
3761
3762 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3763 btrfs_item_nr_offset(0),
3764 right_nritems * sizeof(struct btrfs_item));
3765
3766 /* copy the items from left to right */
3767 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3768 btrfs_item_nr_offset(left_nritems - push_items),
3769 push_items * sizeof(struct btrfs_item));
3770
3771 /* update the item pointers */
3772 right_nritems += push_items;
3773 btrfs_set_header_nritems(right, right_nritems);
3774 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3775 for (i = 0; i < right_nritems; i++) {
3776 item = btrfs_item_nr(i);
3777 push_space -= btrfs_token_item_size(right, item, &token);
3778 btrfs_set_token_item_offset(right, item, push_space, &token);
3779 }
3780
3781 left_nritems -= push_items;
3782 btrfs_set_header_nritems(left, left_nritems);
3783
3784 if (left_nritems)
3785 btrfs_mark_buffer_dirty(left);
3786 else
3787 clean_tree_block(fs_info, left);
3788
3789 btrfs_mark_buffer_dirty(right);
3790
3791 btrfs_item_key(right, &disk_key, 0);
3792 btrfs_set_node_key(upper, &disk_key, slot + 1);
3793 btrfs_mark_buffer_dirty(upper);
3794
3795 /* then fixup the leaf pointer in the path */
3796 if (path->slots[0] >= left_nritems) {
3797 path->slots[0] -= left_nritems;
3798 if (btrfs_header_nritems(path->nodes[0]) == 0)
3799 clean_tree_block(fs_info, path->nodes[0]);
3800 btrfs_tree_unlock(path->nodes[0]);
3801 free_extent_buffer(path->nodes[0]);
3802 path->nodes[0] = right;
3803 path->slots[1] += 1;
3804 } else {
3805 btrfs_tree_unlock(right);
3806 free_extent_buffer(right);
3807 }
3808 return 0;
3809
3810 out_unlock:
3811 btrfs_tree_unlock(right);
3812 free_extent_buffer(right);
3813 return 1;
3814 }
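
/*
 * Worked example (illustrative numbers): pushing the last two items of
 * 'left', holding 40 data bytes between them, recomputes push_space to
 * 40 for the data move. The right leaf's existing data slides 40 bytes
 * toward lower offsets to make room, the pushed bytes land at the very
 * end of the right leaf, the existing item headers shift two slots
 * higher, and the loop above rewrites every data offset in 'right'
 * downward from BTRFS_LEAF_DATA_SIZE(fs_info).
 */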
3815
3816 /*
3817 * push some data in the path leaf to the right, trying to free up at
3818 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3819 *
3820 * returns 1 if the push failed because the other node didn't have enough
3821 * room, 0 if everything worked out and < 0 if there were major errors.
3822 *
3823 * this will push starting from min_slot to the end of the leaf. It won't
3824 * push any slot lower than min_slot
3825 */
3826 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3827 *root, struct btrfs_path *path,
3828 int min_data_size, int data_size,
3829 int empty, u32 min_slot)
3830 {
3831 struct btrfs_fs_info *fs_info = root->fs_info;
3832 struct extent_buffer *left = path->nodes[0];
3833 struct extent_buffer *right;
3834 struct extent_buffer *upper;
3835 int slot;
3836 int free_space;
3837 u32 left_nritems;
3838 int ret;
3839
3840 if (!path->nodes[1])
3841 return 1;
3842
3843 slot = path->slots[1];
3844 upper = path->nodes[1];
3845 if (slot >= btrfs_header_nritems(upper) - 1)
3846 return 1;
3847
3848 btrfs_assert_tree_locked(path->nodes[1]);
3849
3850 right = read_node_slot(fs_info, upper, slot + 1);
3851 /*
3852 * If slot + 1 is not valid or we failed to read the right node,
3853 * it's no big deal, just return.
3854 */
3855 if (IS_ERR(right))
3856 return 1;
3857
3858 btrfs_tree_lock(right);
3859 btrfs_set_lock_blocking(right);
3860
3861 free_space = btrfs_leaf_free_space(fs_info, right);
3862 if (free_space < data_size)
3863 goto out_unlock;
3864
3865 /* cow and double check */
3866 ret = btrfs_cow_block(trans, root, right, upper,
3867 slot + 1, &right);
3868 if (ret)
3869 goto out_unlock;
3870
3871 free_space = btrfs_leaf_free_space(fs_info, right);
3872 if (free_space < data_size)
3873 goto out_unlock;
3874
3875 left_nritems = btrfs_header_nritems(left);
3876 if (left_nritems == 0)
3877 goto out_unlock;
3878
3879 if (path->slots[0] == left_nritems && !empty) {
3880 /* Key greater than all keys in the leaf, right neighbor has
3881 * enough room for it and we're not emptying our leaf to delete
3882 * it, therefore use right neighbor to insert the new item and
3883 * no need to touch/dirty our left leaf. */
3884 btrfs_tree_unlock(left);
3885 free_extent_buffer(left);
3886 path->nodes[0] = right;
3887 path->slots[0] = 0;
3888 path->slots[1]++;
3889 return 0;
3890 }
3891
3892 return __push_leaf_right(fs_info, path, min_data_size, empty,
3893 right, free_space, left_nritems, min_slot);
3894 out_unlock:
3895 btrfs_tree_unlock(right);
3896 free_extent_buffer(right);
3897 return 1;
3898 }
3899
3900 /*
3901 * push some data in the path leaf to the left, trying to free up at
3902 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3903 *
3904 * max_slot can put a limit on how far into the leaf we'll push items. The
3905 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3906 * items
3907 */
3908 static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3909 struct btrfs_path *path, int data_size,
3910 int empty, struct extent_buffer *left,
3911 int free_space, u32 right_nritems,
3912 u32 max_slot)
3913 {
3914 struct btrfs_disk_key disk_key;
3915 struct extent_buffer *right = path->nodes[0];
3916 int i;
3917 int push_space = 0;
3918 int push_items = 0;
3919 struct btrfs_item *item;
3920 u32 old_left_nritems;
3921 u32 nr;
3922 int ret = 0;
3923 u32 this_item_size;
3924 u32 old_left_item_size;
3925 struct btrfs_map_token token;
3926
3927 btrfs_init_map_token(&token);
3928
3929 if (empty)
3930 nr = min(right_nritems, max_slot);
3931 else
3932 nr = min(right_nritems - 1, max_slot);
3933
3934 for (i = 0; i < nr; i++) {
3935 item = btrfs_item_nr(i);
3936
3937 if (!empty && push_items > 0) {
3938 if (path->slots[0] < i)
3939 break;
3940 if (path->slots[0] == i) {
3941 int space = btrfs_leaf_free_space(fs_info, right);
3942 if (space + push_space * 2 > free_space)
3943 break;
3944 }
3945 }
3946
3947 if (path->slots[0] == i)
3948 push_space += data_size;
3949
3950 this_item_size = btrfs_item_size(right, item);
3951 if (this_item_size + sizeof(*item) + push_space > free_space)
3952 break;
3953
3954 push_items++;
3955 push_space += this_item_size + sizeof(*item);
3956 }
3957
3958 if (push_items == 0) {
3959 ret = 1;
3960 goto out;
3961 }
3962 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3963
3964 /* push data from right to left */
3965 copy_extent_buffer(left, right,
3966 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3967 btrfs_item_nr_offset(0),
3968 push_items * sizeof(struct btrfs_item));
3969
3970 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3971 btrfs_item_offset_nr(right, push_items - 1);
3972
3973 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3974 leaf_data_end(fs_info, left) - push_space,
3975 BTRFS_LEAF_DATA_OFFSET +
3976 btrfs_item_offset_nr(right, push_items - 1),
3977 push_space);
3978 old_left_nritems = btrfs_header_nritems(left);
3979 BUG_ON(old_left_nritems <= 0);
3980
3981 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3982 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3983 u32 ioff;
3984
3985 item = btrfs_item_nr(i);
3986
3987 ioff = btrfs_token_item_offset(left, item, &token);
3988 btrfs_set_token_item_offset(left, item,
3989 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3990 &token);
3991 }
3992 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3993
3994 /* fixup right node */
3995 if (push_items > right_nritems)
3996 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3997 right_nritems);
3998
3999 if (push_items < right_nritems) {
4000 push_space = btrfs_item_offset_nr(right, push_items - 1) -
4001 leaf_data_end(fs_info, right);
4002 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
4003 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
4004 BTRFS_LEAF_DATA_OFFSET +
4005 leaf_data_end(fs_info, right), push_space);
4006
4007 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
4008 btrfs_item_nr_offset(push_items),
4009 (btrfs_header_nritems(right) - push_items) *
4010 sizeof(struct btrfs_item));
4011 }
4012 right_nritems -= push_items;
4013 btrfs_set_header_nritems(right, right_nritems);
4014 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
4015 for (i = 0; i < right_nritems; i++) {
4016 item = btrfs_item_nr(i);
4017
4018 push_space = push_space - btrfs_token_item_size(right,
4019 item, &token);
4020 btrfs_set_token_item_offset(right, item, push_space, &token);
4021 }
4022
4023 btrfs_mark_buffer_dirty(left);
4024 if (right_nritems)
4025 btrfs_mark_buffer_dirty(right);
4026 else
4027 clean_tree_block(fs_info, right);
4028
4029 btrfs_item_key(right, &disk_key, 0);
4030 fixup_low_keys(path, &disk_key, 1);
4031
4032 /* then fixup the leaf pointer in the path */
4033 if (path->slots[0] < push_items) {
4034 path->slots[0] += old_left_nritems;
4035 btrfs_tree_unlock(path->nodes[0]);
4036 free_extent_buffer(path->nodes[0]);
4037 path->nodes[0] = left;
4038 path->slots[1] -= 1;
4039 } else {
4040 btrfs_tree_unlock(left);
4041 free_extent_buffer(left);
4042 path->slots[0] -= push_items;
4043 }
4044 BUG_ON(path->slots[0] < 0);
4045 return ret;
4046 out:
4047 btrfs_tree_unlock(left);
4048 free_extent_buffer(left);
4049 return ret;
4050 }
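
/*
 * Offset math sketch (illustrative): a pushed item whose data sat at
 * offset X in 'right' ends up at
 *
 *	X - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)
 *
 * in 'left', which is exactly the shift applied in the loop above: the
 * pushed data block is rebased so it butts up against the data of
 * left's previous last item instead of the end of the block.
 */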
4051
4052 /*
4053 * push some data in the path leaf to the left, trying to free up at
4054 * least data_size bytes. returns zero if the push worked, nonzero otherwise
4055 *
4056 * max_slot can put a limit on how far into the leaf we'll push items. The
4057 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
4058 * items
4059 */
4060 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4061 *root, struct btrfs_path *path, int min_data_size,
4062 int data_size, int empty, u32 max_slot)
4063 {
4064 struct btrfs_fs_info *fs_info = root->fs_info;
4065 struct extent_buffer *right = path->nodes[0];
4066 struct extent_buffer *left;
4067 int slot;
4068 int free_space;
4069 u32 right_nritems;
4070 int ret = 0;
4071
4072 slot = path->slots[1];
4073 if (slot == 0)
4074 return 1;
4075 if (!path->nodes[1])
4076 return 1;
4077
4078 right_nritems = btrfs_header_nritems(right);
4079 if (right_nritems == 0)
4080 return 1;
4081
4082 btrfs_assert_tree_locked(path->nodes[1]);
4083
4084 left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4085 /*
4086 * If slot - 1 is not valid or we failed to read the left node,
4087 * it's no big deal, just return.
4088 */
4089 if (IS_ERR(left))
4090 return 1;
4091
4092 btrfs_tree_lock(left);
4093 btrfs_set_lock_blocking(left);
4094
4095 free_space = btrfs_leaf_free_space(fs_info, left);
4096 if (free_space < data_size) {
4097 ret = 1;
4098 goto out;
4099 }
4100
4101 /* cow and double check */
4102 ret = btrfs_cow_block(trans, root, left,
4103 path->nodes[1], slot - 1, &left);
4104 if (ret) {
4105 /* we hit -ENOSPC, but it isn't fatal here */
4106 if (ret == -ENOSPC)
4107 ret = 1;
4108 goto out;
4109 }
4110
4111 free_space = btrfs_leaf_free_space(fs_info, left);
4112 if (free_space < data_size) {
4113 ret = 1;
4114 goto out;
4115 }
4116
4117 return __push_leaf_left(fs_info, path, min_data_size,
4118 empty, left, free_space, right_nritems,
4119 max_slot);
4120 out:
4121 btrfs_tree_unlock(left);
4122 free_extent_buffer(left);
4123 return ret;
4124 }
4125
4126 /*
4127 * split the path's leaf in two, making sure there is at least data_size
4128 * available for the resulting leaf level of the path.
4129 */
4130 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4131 struct btrfs_fs_info *fs_info,
4132 struct btrfs_path *path,
4133 struct extent_buffer *l,
4134 struct extent_buffer *right,
4135 int slot, int mid, int nritems)
4136 {
4137 int data_copy_size;
4138 int rt_data_off;
4139 int i;
4140 struct btrfs_disk_key disk_key;
4141 struct btrfs_map_token token;
4142
4143 btrfs_init_map_token(&token);
4144
4145 nritems = nritems - mid;
4146 btrfs_set_header_nritems(right, nritems);
4147 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4148
4149 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4150 btrfs_item_nr_offset(mid),
4151 nritems * sizeof(struct btrfs_item));
4152
4153 copy_extent_buffer(right, l,
4154 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4155 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4156 leaf_data_end(fs_info, l), data_copy_size);
4157
4158 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4159
4160 for (i = 0; i < nritems; i++) {
4161 struct btrfs_item *item = btrfs_item_nr(i);
4162 u32 ioff;
4163
4164 ioff = btrfs_token_item_offset(right, item, &token);
4165 btrfs_set_token_item_offset(right, item,
4166 ioff + rt_data_off, &token);
4167 }
4168
4169 btrfs_set_header_nritems(l, mid);
4170 btrfs_item_key(right, &disk_key, 0);
4171 insert_ptr(trans, fs_info, path, &disk_key, right->start,
4172 path->slots[1] + 1, 1);
4173
4174 btrfs_mark_buffer_dirty(right);
4175 btrfs_mark_buffer_dirty(l);
4176 BUG_ON(path->slots[0] != slot);
4177
4178 if (mid <= slot) {
4179 btrfs_tree_unlock(path->nodes[0]);
4180 free_extent_buffer(path->nodes[0]);
4181 path->nodes[0] = right;
4182 path->slots[0] -= mid;
4183 path->slots[1] += 1;
4184 } else {
4185 btrfs_tree_unlock(right);
4186 free_extent_buffer(right);
4187 }
4188
4189 BUG_ON(path->slots[0] < 0);
4190 }
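
/*
 * Offset math sketch (illustrative): if the data of item 'mid' ended at
 * offset E in the left leaf, the copied data is placed so it ends at
 * BTRFS_LEAF_DATA_SIZE(fs_info) in the right leaf, so every copied item
 * offset grows by rt_data_off == BTRFS_LEAF_DATA_SIZE(fs_info) - E,
 * which is what the token loop above applies.
 */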
4191
4192 /*
4193 * double splits happen when we need to insert a big item in the middle
4194 * of a leaf. A double split can leave us with 3 mostly empty leaves:
4195 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4196 * A B C
4197 *
4198 * We avoid this by trying to push the items on either side of our target
4199 * into the adjacent leaves. If all goes well we can avoid the double split
4200 * completely.
4201 */
4202 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4203 struct btrfs_root *root,
4204 struct btrfs_path *path,
4205 int data_size)
4206 {
4207 struct btrfs_fs_info *fs_info = root->fs_info;
4208 int ret;
4209 int progress = 0;
4210 int slot;
4211 u32 nritems;
4212 int space_needed = data_size;
4213
4214 slot = path->slots[0];
4215 if (slot < btrfs_header_nritems(path->nodes[0]))
4216 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4217
4218 /*
4219 * try to push all the items after our slot into the
4220 * right leaf
4221 */
4222 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4223 if (ret < 0)
4224 return ret;
4225
4226 if (ret == 0)
4227 progress++;
4228
4229 nritems = btrfs_header_nritems(path->nodes[0]);
4230 /*
4231 * our goal is to get our slot at the start or end of a leaf. If
4232 * we've done so we're done
4233 */
4234 if (path->slots[0] == 0 || path->slots[0] == nritems)
4235 return 0;
4236
4237 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4238 return 0;
4239
4240 /* try to push all the items before our slot into the next leaf */
4241 slot = path->slots[0];
4242 space_needed = data_size;
4243 if (slot > 0)
4244 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4245 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4246 if (ret < 0)
4247 return ret;
4248
4249 if (ret == 0)
4250 progress++;
4251
4252 if (progress)
4253 return 0;
4254 return 1;
4255 }
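
/*
 * Illustrative outcome: for a leaf laid out as [ A ... target ... C ],
 * a successful right push moves the C items away, leaving
 * [ A ... target ], and a successful left push then moves the A items
 * away, leaving [ target ]. The target lands at the start or end of
 * its leaf, so a later split produces two leaves instead of the three
 * mostly empty ones described above.
 */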
4256
4257 /*
4258 * split the path's leaf in two, making sure there is at least data_size
4259 * available for the resulting leaf level of the path.
4260 *
4261 * returns 0 if all went well and < 0 on failure.
4262 */
4263 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4264 struct btrfs_root *root,
4265 const struct btrfs_key *ins_key,
4266 struct btrfs_path *path, int data_size,
4267 int extend)
4268 {
4269 struct btrfs_disk_key disk_key;
4270 struct extent_buffer *l;
4271 u32 nritems;
4272 int mid;
4273 int slot;
4274 struct extent_buffer *right;
4275 struct btrfs_fs_info *fs_info = root->fs_info;
4276 int ret = 0;
4277 int wret;
4278 int split;
4279 int num_doubles = 0;
4280 int tried_avoid_double = 0;
4281
4282 l = path->nodes[0];
4283 slot = path->slots[0];
4284 if (extend && data_size + btrfs_item_size_nr(l, slot) +
4285 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4286 return -EOVERFLOW;
4287
4288 /* first try to make some room by pushing left and right */
4289 if (data_size && path->nodes[1]) {
4290 int space_needed = data_size;
4291
4292 if (slot < btrfs_header_nritems(l))
4293 space_needed -= btrfs_leaf_free_space(fs_info, l);
4294
4295 wret = push_leaf_right(trans, root, path, space_needed,
4296 space_needed, 0, 0);
4297 if (wret < 0)
4298 return wret;
4299 if (wret) {
4300 space_needed = data_size;
4301 if (slot > 0)
4302 space_needed -= btrfs_leaf_free_space(fs_info,
4303 l);
4304 wret = push_leaf_left(trans, root, path, space_needed,
4305 space_needed, 0, (u32)-1);
4306 if (wret < 0)
4307 return wret;
4308 }
4309 l = path->nodes[0];
4310
4311 /* did the pushes work? */
4312 if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4313 return 0;
4314 }
4315
4316 if (!path->nodes[1]) {
4317 ret = insert_new_root(trans, root, path, 1);
4318 if (ret)
4319 return ret;
4320 }
4321 again:
4322 split = 1;
4323 l = path->nodes[0];
4324 slot = path->slots[0];
4325 nritems = btrfs_header_nritems(l);
4326 mid = (nritems + 1) / 2;
4327
4328 if (mid <= slot) {
4329 if (nritems == 1 ||
4330 leaf_space_used(l, mid, nritems - mid) + data_size >
4331 BTRFS_LEAF_DATA_SIZE(fs_info)) {
4332 if (slot >= nritems) {
4333 split = 0;
4334 } else {
4335 mid = slot;
4336 if (mid != nritems &&
4337 leaf_space_used(l, mid, nritems - mid) +
4338 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4339 if (data_size && !tried_avoid_double)
4340 goto push_for_double;
4341 split = 2;
4342 }
4343 }
4344 }
4345 } else {
4346 if (leaf_space_used(l, 0, mid) + data_size >
4347 BTRFS_LEAF_DATA_SIZE(fs_info)) {
4348 if (!extend && data_size && slot == 0) {
4349 split = 0;
4350 } else if ((extend || !data_size) && slot == 0) {
4351 mid = 1;
4352 } else {
4353 mid = slot;
4354 if (mid != nritems &&
4355 leaf_space_used(l, mid, nritems - mid) +
4356 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4357 if (data_size && !tried_avoid_double)
4358 goto push_for_double;
4359 split = 2;
4360 }
4361 }
4362 }
4363 }
4364
4365 if (split == 0)
4366 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4367 else
4368 btrfs_item_key(l, &disk_key, mid);
4369
4370 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4371 l->start, 0);
4372 if (IS_ERR(right))
4373 return PTR_ERR(right);
4374
4375 root_add_used(root, fs_info->nodesize);
4376
4377 if (split == 0) {
4378 if (mid <= slot) {
4379 btrfs_set_header_nritems(right, 0);
4380 insert_ptr(trans, fs_info, path, &disk_key,
4381 right->start, path->slots[1] + 1, 1);
4382 btrfs_tree_unlock(path->nodes[0]);
4383 free_extent_buffer(path->nodes[0]);
4384 path->nodes[0] = right;
4385 path->slots[0] = 0;
4386 path->slots[1] += 1;
4387 } else {
4388 btrfs_set_header_nritems(right, 0);
4389 insert_ptr(trans, fs_info, path, &disk_key,
4390 right->start, path->slots[1], 1);
4391 btrfs_tree_unlock(path->nodes[0]);
4392 free_extent_buffer(path->nodes[0]);
4393 path->nodes[0] = right;
4394 path->slots[0] = 0;
4395 if (path->slots[1] == 0)
4396 fixup_low_keys(path, &disk_key, 1);
4397 }
4398 /*
4399 * We create a new leaf 'right' for the required ins_len and
4400 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4401 * the new item's content into 'right'.
4402 */
4403 return ret;
4404 }
4405
4406 copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4407
4408 if (split == 2) {
4409 BUG_ON(num_doubles != 0);
4410 num_doubles++;
4411 goto again;
4412 }
4413
4414 return 0;
4415
4416 push_for_double:
4417 push_for_double_split(trans, root, path, data_size);
4418 tried_avoid_double = 1;
4419 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4420 return 0;
4421 goto again;
4422 }
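
/*
 * Split-point sketch (illustrative): with nritems == 10 the first guess
 * is mid == 5. If slot == nritems (pure append) and the upper half
 * would still overflow, split becomes 0 and a brand new empty right
 * leaf simply receives the insertion. If slot sits mid-leaf and
 * neither half can absorb data_size even with mid == slot, split
 * becomes 2 and the leaf is split a second time, unless the
 * push_for_double path above already made enough room.
 */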
4423
4424 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4425 struct btrfs_root *root,
4426 struct btrfs_path *path, int ins_len)
4427 {
4428 struct btrfs_fs_info *fs_info = root->fs_info;
4429 struct btrfs_key key;
4430 struct extent_buffer *leaf;
4431 struct btrfs_file_extent_item *fi;
4432 u64 extent_len = 0;
4433 u32 item_size;
4434 int ret;
4435
4436 leaf = path->nodes[0];
4437 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4438
4439 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4440 key.type != BTRFS_EXTENT_CSUM_KEY);
4441
4442 if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4443 return 0;
4444
4445 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4446 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4447 fi = btrfs_item_ptr(leaf, path->slots[0],
4448 struct btrfs_file_extent_item);
4449 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4450 }
4451 btrfs_release_path(path);
4452
4453 path->keep_locks = 1;
4454 path->search_for_split = 1;
4455 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4456 path->search_for_split = 0;
4457 if (ret > 0)
4458 ret = -EAGAIN;
4459 if (ret < 0)
4460 goto err;
4461
4462 ret = -EAGAIN;
4463 leaf = path->nodes[0];
4464 /* if our item isn't there, return now */
4465 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4466 goto err;
4467
4468 /* the leaf has changed, it now has room. return now */
4469 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4470 goto err;
4471
4472 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4473 fi = btrfs_item_ptr(leaf, path->slots[0],
4474 struct btrfs_file_extent_item);
4475 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4476 goto err;
4477 }
4478
4479 btrfs_set_path_blocking(path);
4480 ret = split_leaf(trans, root, &key, path, ins_len, 1);
4481 if (ret)
4482 goto err;
4483
4484 path->keep_locks = 0;
4485 btrfs_unlock_up_safe(path, 1);
4486 return 0;
4487 err:
4488 path->keep_locks = 0;
4489 return ret;
4490 }
4491
4492 static noinline int split_item(struct btrfs_fs_info *fs_info,
4493 struct btrfs_path *path,
4494 const struct btrfs_key *new_key,
4495 unsigned long split_offset)
4496 {
4497 struct extent_buffer *leaf;
4498 struct btrfs_item *item;
4499 struct btrfs_item *new_item;
4500 int slot;
4501 char *buf;
4502 u32 nritems;
4503 u32 item_size;
4504 u32 orig_offset;
4505 struct btrfs_disk_key disk_key;
4506
4507 leaf = path->nodes[0];
4508 BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4509
4510 btrfs_set_path_blocking(path);
4511
4512 item = btrfs_item_nr(path->slots[0]);
4513 orig_offset = btrfs_item_offset(leaf, item);
4514 item_size = btrfs_item_size(leaf, item);
4515
4516 buf = kmalloc(item_size, GFP_NOFS);
4517 if (!buf)
4518 return -ENOMEM;
4519
4520 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4521 path->slots[0]), item_size);
4522
4523 slot = path->slots[0] + 1;
4524 nritems = btrfs_header_nritems(leaf);
4525 if (slot != nritems) {
4526 /* shift the items */
4527 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4528 btrfs_item_nr_offset(slot),
4529 (nritems - slot) * sizeof(struct btrfs_item));
4530 }
4531
4532 btrfs_cpu_key_to_disk(&disk_key, new_key);
4533 btrfs_set_item_key(leaf, &disk_key, slot);
4534
4535 new_item = btrfs_item_nr(slot);
4536
4537 btrfs_set_item_offset(leaf, new_item, orig_offset);
4538 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4539
4540 btrfs_set_item_offset(leaf, item,
4541 orig_offset + item_size - split_offset);
4542 btrfs_set_item_size(leaf, item, split_offset);
4543
4544 btrfs_set_header_nritems(leaf, nritems + 1);
4545
4546 /* write the data for the start of the original item */
4547 write_extent_buffer(leaf, buf,
4548 btrfs_item_ptr_offset(leaf, path->slots[0]),
4549 split_offset);
4550
4551 /* write the data for the new item */
4552 write_extent_buffer(leaf, buf + split_offset,
4553 btrfs_item_ptr_offset(leaf, slot),
4554 item_size - split_offset);
4555 btrfs_mark_buffer_dirty(leaf);
4556
4557 BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4558 kfree(buf);
4559 return 0;
4560 }
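
/*
 * Worked example (illustrative numbers): splitting an item of size 60
 * at split_offset 24, whose data starts at offset 100, leaves the
 * original item with offset 136 and size 24 (the first 24 bytes) and
 * gives the new item offset 100 and size 36 (the remaining bytes),
 * matching the two write_extent_buffer() calls above.
 */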
4561
4562 /*
4563 * This function splits a single item into two items,
4564 * giving 'new_key' to the new item and splitting the
4565 * old one at split_offset (from the start of the item).
4566 *
4567 * The path may be released by this operation. After
4568 * the split, the path is pointing to the old item. The
4569 * new item is going to be in the same node as the old one.
4570 *
4571 * Note: the item being split must be small enough to live alone on
4572 * a tree block with room for one extra struct btrfs_item
4573 *
4574 * This allows us to split the item in place, keeping a lock on the
4575 * leaf the entire time.
4576 */
4577 int btrfs_split_item(struct btrfs_trans_handle *trans,
4578 struct btrfs_root *root,
4579 struct btrfs_path *path,
4580 const struct btrfs_key *new_key,
4581 unsigned long split_offset)
4582 {
4583 int ret;
4584 ret = setup_leaf_for_split(trans, root, path,
4585 sizeof(struct btrfs_item));
4586 if (ret)
4587 return ret;
4588
4589 ret = split_item(root->fs_info, path, new_key, split_offset);
4590 return ret;
4591 }
4592
4593 /*
4594 * This function duplicates an item, giving 'new_key' to the new item.
4595 * It guarantees both items live in the same tree leaf and the new item
4596 * is contiguous with the original item.
4597 *
4598 * This allows us to split a file extent in place, keeping a lock on the
4599 * leaf the entire time.
4600 */
4601 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4602 struct btrfs_root *root,
4603 struct btrfs_path *path,
4604 const struct btrfs_key *new_key)
4605 {
4606 struct extent_buffer *leaf;
4607 int ret;
4608 u32 item_size;
4609
4610 leaf = path->nodes[0];
4611 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4612 ret = setup_leaf_for_split(trans, root, path,
4613 item_size + sizeof(struct btrfs_item));
4614 if (ret)
4615 return ret;
4616
4617 path->slots[0]++;
4618 setup_items_for_insert(root, path, new_key, &item_size,
4619 item_size, item_size +
4620 sizeof(struct btrfs_item), 1);
4621 leaf = path->nodes[0];
4622 memcpy_extent_buffer(leaf,
4623 btrfs_item_ptr_offset(leaf, path->slots[0]),
4624 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4625 item_size);
4626 return 0;
4627 }
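
/*
 * Usage sketch (assumed caller, for illustration): splitting a file
 * extent item at 'split' typically looks like
 *
 *	new_key.objectid = key.objectid;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = split;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *
 * after which path->slots[0] points at the copy and the caller trims
 * the original extent to end at 'split' and the copy to start there.
 */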
4628
4629 /*
4630 * make the item pointed to by the path smaller. new_size indicates
4631 * how small to make it, and from_end tells us if we just chop bytes
4632 * off the end of the item or if we shift the item to chop bytes off
4633 * the front.
4634 */
4635 void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4636 struct btrfs_path *path, u32 new_size, int from_end)
4637 {
4638 int slot;
4639 struct extent_buffer *leaf;
4640 struct btrfs_item *item;
4641 u32 nritems;
4642 unsigned int data_end;
4643 unsigned int old_data_start;
4644 unsigned int old_size;
4645 unsigned int size_diff;
4646 int i;
4647 struct btrfs_map_token token;
4648
4649 btrfs_init_map_token(&token);
4650
4651 leaf = path->nodes[0];
4652 slot = path->slots[0];
4653
4654 old_size = btrfs_item_size_nr(leaf, slot);
4655 if (old_size == new_size)
4656 return;
4657
4658 nritems = btrfs_header_nritems(leaf);
4659 data_end = leaf_data_end(fs_info, leaf);
4660
4661 old_data_start = btrfs_item_offset_nr(leaf, slot);
4662
4663 size_diff = old_size - new_size;
4664
4665 BUG_ON(slot < 0);
4666 BUG_ON(slot >= nritems);
4667
4668 /*
4669 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4670 */
4671 /* first correct the data pointers */
4672 for (i = slot; i < nritems; i++) {
4673 u32 ioff;
4674 item = btrfs_item_nr(i);
4675
4676 ioff = btrfs_token_item_offset(leaf, item, &token);
4677 btrfs_set_token_item_offset(leaf, item,
4678 ioff + size_diff, &token);
4679 }
4680
4681 /* shift the data */
4682 if (from_end) {
4683 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4684 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4685 data_end, old_data_start + new_size - data_end);
4686 } else {
4687 struct btrfs_disk_key disk_key;
4688 u64 offset;
4689
4690 btrfs_item_key(leaf, &disk_key, slot);
4691
4692 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4693 unsigned long ptr;
4694 struct btrfs_file_extent_item *fi;
4695
4696 fi = btrfs_item_ptr(leaf, slot,
4697 struct btrfs_file_extent_item);
4698 fi = (struct btrfs_file_extent_item *)(
4699 (unsigned long)fi - size_diff);
4700
4701 if (btrfs_file_extent_type(leaf, fi) ==
4702 BTRFS_FILE_EXTENT_INLINE) {
4703 ptr = btrfs_item_ptr_offset(leaf, slot);
4704 memmove_extent_buffer(leaf, ptr,
4705 (unsigned long)fi,
4706 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4707 }
4708 }
4709
4710 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4711 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4712 data_end, old_data_start - data_end);
4713
4714 offset = btrfs_disk_key_offset(&disk_key);
4715 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4716 btrfs_set_item_key(leaf, &disk_key, slot);
4717 if (slot == 0)
4718 fixup_low_keys(path, &disk_key, 1);
4719 }
4720
4721 item = btrfs_item_nr(slot);
4722 btrfs_set_item_size(leaf, item, new_size);
4723 btrfs_mark_buffer_dirty(leaf);
4724
4725 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4726 btrfs_print_leaf(leaf);
4727 BUG();
4728 }
4729 }
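
/*
 * Worked example (illustrative numbers): truncating a 100 byte item
 * whose data starts at offset 1000 down to 40 bytes with from_end set
 * shifts the first 40 bytes (and all later items' data) up by
 * size_diff == 60, so the item ends up at offset 1060 and still ends
 * at 1100. Without from_end the tail bytes are kept instead, and the
 * item's key offset is bumped by 60 to account for the dropped front.
 */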
4730
4731 /*
4732 * make the item pointed to by the path bigger, data_size is the added size.
4733 */
4734 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4735 u32 data_size)
4736 {
4737 int slot;
4738 struct extent_buffer *leaf;
4739 struct btrfs_item *item;
4740 u32 nritems;
4741 unsigned int data_end;
4742 unsigned int old_data;
4743 unsigned int old_size;
4744 int i;
4745 struct btrfs_map_token token;
4746
4747 btrfs_init_map_token(&token);
4748
4749 leaf = path->nodes[0];
4750
4751 nritems = btrfs_header_nritems(leaf);
4752 data_end = leaf_data_end(fs_info, leaf);
4753
4754 if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4755 btrfs_print_leaf(leaf);
4756 BUG();
4757 }
4758 slot = path->slots[0];
4759 old_data = btrfs_item_end_nr(leaf, slot);
4760
4761 BUG_ON(slot < 0);
4762 if (slot >= nritems) {
4763 btrfs_print_leaf(leaf);
4764 btrfs_crit(fs_info, "slot %d too large, nritems %d",
4765 slot, nritems);
4766 BUG_ON(1);
4767 }
4768
4769 /*
4770 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4771 */
4772 /* first correct the data pointers */
4773 for (i = slot; i < nritems; i++) {
4774 u32 ioff;
4775 item = btrfs_item_nr(i);
4776
4777 ioff = btrfs_token_item_offset(leaf, item, &token);
4778 btrfs_set_token_item_offset(leaf, item,
4779 ioff - data_size, &token);
4780 }
4781
4782 /* shift the data */
4783 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4784 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4785 data_end, old_data - data_end);
4786
4787 data_end = old_data;
4788 old_size = btrfs_item_size_nr(leaf, slot);
4789 item = btrfs_item_nr(slot);
4790 btrfs_set_item_size(leaf, item, old_size + data_size);
4791 btrfs_mark_buffer_dirty(leaf);
4792
4793 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4794 btrfs_print_leaf(leaf);
4795 BUG();
4796 }
4797 }
4798
4799 /*
4800 * this is a helper for btrfs_insert_empty_items, the main goal here is
4801 * to save stack depth by doing the bulk of the work in a function
4802 * that doesn't call btrfs_search_slot
4803 */
4804 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4805 const struct btrfs_key *cpu_key, u32 *data_size,
4806 u32 total_data, u32 total_size, int nr)
4807 {
4808 struct btrfs_fs_info *fs_info = root->fs_info;
4809 struct btrfs_item *item;
4810 int i;
4811 u32 nritems;
4812 unsigned int data_end;
4813 struct btrfs_disk_key disk_key;
4814 struct extent_buffer *leaf;
4815 int slot;
4816 struct btrfs_map_token token;
4817
4818 if (path->slots[0] == 0) {
4819 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4820 fixup_low_keys(path, &disk_key, 1);
4821 }
4822 btrfs_unlock_up_safe(path, 1);
4823
4824 btrfs_init_map_token(&token);
4825
4826 leaf = path->nodes[0];
4827 slot = path->slots[0];
4828
4829 nritems = btrfs_header_nritems(leaf);
4830 data_end = leaf_data_end(fs_info, leaf);
4831
4832 if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4833 btrfs_print_leaf(leaf);
4834 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4835 total_size, btrfs_leaf_free_space(fs_info, leaf));
4836 BUG();
4837 }
4838
4839 if (slot != nritems) {
4840 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4841
4842 if (old_data < data_end) {
4843 btrfs_print_leaf(leaf);
4844 btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4845 slot, old_data, data_end);
4846 BUG_ON(1);
4847 }
4848 /*
4849 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4850 */
4851 /* first correct the data pointers */
4852 for (i = slot; i < nritems; i++) {
4853 u32 ioff;
4854
4855 item = btrfs_item_nr(i);
4856 ioff = btrfs_token_item_offset(leaf, item, &token);
4857 btrfs_set_token_item_offset(leaf, item,
4858 ioff - total_data, &token);
4859 }
4860 /* shift the items */
4861 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4862 btrfs_item_nr_offset(slot),
4863 (nritems - slot) * sizeof(struct btrfs_item));
4864
4865 /* shift the data */
4866 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4867 data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4868 data_end, old_data - data_end);
4869 data_end = old_data;
4870 }
4871
4872 /* setup the item for the new data */
4873 for (i = 0; i < nr; i++) {
4874 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4875 btrfs_set_item_key(leaf, &disk_key, slot + i);
4876 item = btrfs_item_nr(slot + i);
4877 btrfs_set_token_item_offset(leaf, item,
4878 data_end - data_size[i], &token);
4879 data_end -= data_size[i];
4880 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4881 }
4882
4883 btrfs_set_header_nritems(leaf, nritems + nr);
4884 btrfs_mark_buffer_dirty(leaf);
4885
4886 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4887 btrfs_print_leaf(leaf);
4888 BUG();
4889 }
4890 }
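
/*
 * Leaf layout reminder (illustrative):
 *
 *	[header][item 0][item 1]..[item N]  ..free..  [data N]..[data 1][data 0]
 *
 * so inserting nr items at 'slot' shifts headers slot..N nr slots to
 * the right, slides the data of those items total_data bytes toward
 * the item array, and carves the new items' data out of the hole that
 * opens at the old data_end, as done above.
 */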
4891
4892 /*
4893 * Given a key and some data, insert items into the tree.
4894 * This does all the path init required, making room in the tree if needed.
4895 */
4896 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4897 struct btrfs_root *root,
4898 struct btrfs_path *path,
4899 const struct btrfs_key *cpu_key, u32 *data_size,
4900 int nr)
4901 {
4902 int ret = 0;
4903 int slot;
4904 int i;
4905 u32 total_size = 0;
4906 u32 total_data = 0;
4907
4908 for (i = 0; i < nr; i++)
4909 total_data += data_size[i];
4910
4911 total_size = total_data + (nr * sizeof(struct btrfs_item));
4912 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4913 if (ret == 0)
4914 return -EEXIST;
4915 if (ret < 0)
4916 return ret;
4917
4918 slot = path->slots[0];
4919 BUG_ON(slot < 0);
4920
4921 setup_items_for_insert(root, path, cpu_key, data_size,
4922 total_data, total_size, nr);
4923 return 0;
4924 }
4925
4926 /*
4927 * Given a key and some data, insert an item into the tree.
4928 * This does all the path init required, making room in the tree if needed.
4929 */
4930 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4931 const struct btrfs_key *cpu_key, void *data,
4932 u32 data_size)
4933 {
4934 int ret = 0;
4935 struct btrfs_path *path;
4936 struct extent_buffer *leaf;
4937 unsigned long ptr;
4938
4939 path = btrfs_alloc_path();
4940 if (!path)
4941 return -ENOMEM;
4942 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4943 if (!ret) {
4944 leaf = path->nodes[0];
4945 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4946 write_extent_buffer(leaf, data, ptr, data_size);
4947 btrfs_mark_buffer_dirty(leaf);
4948 }
4949 btrfs_free_path(path);
4950 return ret;
4951 }
4952
4953 /*
4954 * delete the pointer from a given node.
4955 *
4956 * the tree should have been previously balanced so the deletion does not
4957 * empty a node.
4958 */
4959 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4960 int level, int slot)
4961 {
4962 struct extent_buffer *parent = path->nodes[level];
4963 u32 nritems;
4964 int ret;
4965
4966 nritems = btrfs_header_nritems(parent);
4967 if (slot != nritems - 1) {
4968 if (level) {
4969 ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4970 nritems - slot - 1);
4971 BUG_ON(ret < 0);
4972 }
4973 memmove_extent_buffer(parent,
4974 btrfs_node_key_ptr_offset(slot),
4975 btrfs_node_key_ptr_offset(slot + 1),
4976 sizeof(struct btrfs_key_ptr) *
4977 (nritems - slot - 1));
4978 } else if (level) {
4979 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4980 GFP_NOFS);
4981 BUG_ON(ret < 0);
4982 }
4983
4984 nritems--;
4985 btrfs_set_header_nritems(parent, nritems);
4986 if (nritems == 0 && parent == root->node) {
4987 BUG_ON(btrfs_header_level(root->node) != 1);
4988 /* just turn the root into a leaf and break */
4989 btrfs_set_header_level(root->node, 0);
4990 } else if (slot == 0) {
4991 struct btrfs_disk_key disk_key;
4992
4993 btrfs_node_key(parent, &disk_key, 0);
4994 fixup_low_keys(path, &disk_key, level + 1);
4995 }
4996 btrfs_mark_buffer_dirty(parent);
4997 }
4998
4999 /*
5000 * a helper function to delete the leaf pointed to by path->slots[1] and
5001 * path->nodes[1].
5002 *
5003 * This deletes the pointer in path->nodes[1] and frees the leaf
5004 * block extent. zero is returned if it all worked out, < 0 otherwise.
5005 *
5006 * The path must have already been setup for deleting the leaf, including
5007 * all the proper balancing. path->nodes[1] must be locked.
5008 */
5009 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
5010 struct btrfs_root *root,
5011 struct btrfs_path *path,
5012 struct extent_buffer *leaf)
5013 {
5014 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
5015 del_ptr(root, path, 1, path->slots[1]);
5016
5017 /*
5018 * btrfs_free_extent is expensive, we want to make sure we
5019 * aren't holding any locks when we call it
5020 */
5021 btrfs_unlock_up_safe(path, 0);
5022
5023 root_sub_used(root, leaf->len);
5024
5025 extent_buffer_get(leaf);
5026 btrfs_free_tree_block(trans, root, leaf, 0, 1);
5027 free_extent_buffer_stale(leaf);
5028 }
5029 /*
5030 * delete the item at the leaf level in path. If that empties
5031 * the leaf, remove it from the tree
5032 */
5033 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5034 struct btrfs_path *path, int slot, int nr)
5035 {
5036 struct btrfs_fs_info *fs_info = root->fs_info;
5037 struct extent_buffer *leaf;
5038 struct btrfs_item *item;
5039 u32 last_off;
5040 u32 dsize = 0;
5041 int ret = 0;
5042 int wret;
5043 int i;
5044 u32 nritems;
5045 struct btrfs_map_token token;
5046
5047 btrfs_init_map_token(&token);
5048
5049 leaf = path->nodes[0];
5050 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5051
5052 for (i = 0; i < nr; i++)
5053 dsize += btrfs_item_size_nr(leaf, slot + i);
5054
5055 nritems = btrfs_header_nritems(leaf);
5056
5057 if (slot + nr != nritems) {
5058 int data_end = leaf_data_end(fs_info, leaf);
5059
5060 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5061 data_end + dsize,
5062 BTRFS_LEAF_DATA_OFFSET + data_end,
5063 last_off - data_end);
5064
5065 for (i = slot + nr; i < nritems; i++) {
5066 u32 ioff;
5067
5068 item = btrfs_item_nr(i);
5069 ioff = btrfs_token_item_offset(leaf, item, &token);
5070 btrfs_set_token_item_offset(leaf, item,
5071 ioff + dsize, &token);
5072 }
5073
5074 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5075 btrfs_item_nr_offset(slot + nr),
5076 sizeof(struct btrfs_item) *
5077 (nritems - slot - nr));
5078 }
5079 btrfs_set_header_nritems(leaf, nritems - nr);
5080 nritems -= nr;
5081
5082 /* delete the leaf if we've emptied it */
5083 if (nritems == 0) {
5084 if (leaf == root->node) {
5085 btrfs_set_header_level(leaf, 0);
5086 } else {
5087 btrfs_set_path_blocking(path);
5088 clean_tree_block(fs_info, leaf);
5089 btrfs_del_leaf(trans, root, path, leaf);
5090 }
5091 } else {
5092 int used = leaf_space_used(leaf, 0, nritems);
5093 if (slot == 0) {
5094 struct btrfs_disk_key disk_key;
5095
5096 btrfs_item_key(leaf, &disk_key, 0);
5097 fixup_low_keys(path, &disk_key, 1);
5098 }
5099
5100 /* delete the leaf if it is mostly empty */
5101 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5102 /* push_leaf_left fixes the path.
5103 * make sure the path still points to our leaf
5104 * for possible call to del_ptr below
5105 */
5106 slot = path->slots[1];
5107 extent_buffer_get(leaf);
5108
5109 btrfs_set_path_blocking(path);
5110 wret = push_leaf_left(trans, root, path, 1, 1,
5111 1, (u32)-1);
5112 if (wret < 0 && wret != -ENOSPC)
5113 ret = wret;
5114
5115 if (path->nodes[0] == leaf &&
5116 btrfs_header_nritems(leaf)) {
5117 wret = push_leaf_right(trans, root, path, 1,
5118 1, 1, 0);
5119 if (wret < 0 && wret != -ENOSPC)
5120 ret = wret;
5121 }
5122
5123 if (btrfs_header_nritems(leaf) == 0) {
5124 path->slots[1] = slot;
5125 btrfs_del_leaf(trans, root, path, leaf);
5126 free_extent_buffer(leaf);
5127 ret = 0;
5128 } else {
5129 /* if we're still in the path, make sure
5130 * we're dirty. Otherwise, one of the
5131 * push_leaf functions must have already
5132 * dirtied this buffer
5133 */
5134 if (path->nodes[0] == leaf)
5135 btrfs_mark_buffer_dirty(leaf);
5136 free_extent_buffer(leaf);
5137 }
5138 } else {
5139 btrfs_mark_buffer_dirty(leaf);
5140 }
5141 }
5142 return ret;
5143 }
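
/*
 * Rebalance policy sketch (illustrative): a leaf left using less than
 * BTRFS_LEAF_DATA_SIZE(fs_info) / 3 is first pushed left, then, if
 * items remain, pushed right; only when both pushes drain it
 * completely is the leaf itself deleted. With a 16KiB node that
 * threshold works out to a bit over 5KiB of used space.
 */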
5144
5145 /*
5146 * search the tree again to find a leaf with lesser keys.
5147 * returns 0 if it found something or 1 if there are no lesser leaves.
5148 * returns < 0 on io errors.
5149 *
5150 * This may release the path, and so you may lose any locks held at the
5151 * time you call it.
5152 */
5153 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5154 {
5155 struct btrfs_key key;
5156 struct btrfs_key orig_key;
5157 struct btrfs_disk_key found_key;
5158 int ret;
5159
5160 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5161 orig_key = key;
5162
5163 if (key.offset > 0) {
5164 key.offset--;
5165 } else if (key.type > 0) {
5166 key.type--;
5167 key.offset = (u64)-1;
5168 } else if (key.objectid > 0) {
5169 key.objectid--;
5170 key.type = (u8)-1;
5171 key.offset = (u64)-1;
5172 } else {
5173 return 1;
5174 }
5175
5176 btrfs_release_path(path);
5177 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5178 if (ret <= 0)
5179 return ret;
5180
5181 /*
5182 * Previous key not found. Even if we were at slot 0 of the leaf we had
5183 * before releasing the path and calling btrfs_search_slot(), we now may
5184 * be in a slot pointing to the same original key - this can happen if
5185 * after we released the path, one or more items were moved from a
5186 * sibling leaf into the front of the leaf we had due to an insertion
5187 * (see push_leaf_right()).
5188 * If we hit this case and our slot is > 0, just decrement the slot
5189 * so that the caller does not process the same key again, which may or
5190 * may not break the caller, depending on its logic.
5191 */
5192 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5193 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
5194 ret = comp_keys(&found_key, &orig_key);
5195 if (ret == 0) {
5196 if (path->slots[0] > 0) {
5197 path->slots[0]--;
5198 return 0;
5199 }
5200 /*
5201 * At slot 0, same key as before, it means orig_key is
5202 * the lowest, leftmost, key in the tree. We're done.
5203 */
5204 return 1;
5205 }
5206 }
5207
5208 btrfs_item_key(path->nodes[0], &found_key, 0);
5209 ret = comp_keys(&found_key, &key);
5210 /*
5211 * We might have had an item with the previous key in the tree right
5212 * before we released our path. And after we released our path, that
5213 * item might have been pushed to the first slot (0) of the leaf we
5214 * were holding due to a tree balance. Alternatively, an item with the
5215 * previous key can exist as the only element of a leaf (big fat item).
5216 * Therefore account for these 2 cases, so that our callers (like
5217 * btrfs_previous_item) don't miss an existing item with a key matching
5218 * the previous key we computed above.
5219 */
5220 if (ret <= 0)
5221 return 0;
5222 return 1;
5223 }
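
/*
 * Predecessor-key example (illustrative): starting from a first-slot
 * key of (objectid 42, type 12, offset 0), the search key becomes
 * (42, 11, (u64)-1). Only the all-zero key has no predecessor and
 * returns 1 immediately.
 */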
5224
5225 /*
5226 * A helper function to walk down the tree starting at min_key, and looking
5227 * for nodes or leaves that have a minimum transaction id.
5228 * This is used by the btree defrag code and by tree logging.
5229 *
5230 * This does not cow, but it does stuff the starting key it finds back
5231 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5232 * key and get a writable path.
5233 *
5234 * This honors path->lowest_level to prevent descent past a given level
5235 * of the tree.
5236 *
5237 * min_trans indicates the oldest transaction that you are interested
5238 * in walking through. Any nodes or leaves older than min_trans are
5239 * skipped over (without reading them).
5240 *
5241 * returns zero if something useful was found, < 0 on error and 1 if there
5242 * was nothing in the tree that matched the search criteria.
5243 */
5244 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5245 struct btrfs_path *path,
5246 u64 min_trans)
5247 {
5248 struct btrfs_fs_info *fs_info = root->fs_info;
5249 struct extent_buffer *cur;
5250 struct btrfs_key found_key;
5251 int slot;
5252 int sret;
5253 u32 nritems;
5254 int level;
5255 int ret = 1;
5256 int keep_locks = path->keep_locks;
5257
5258 path->keep_locks = 1;
5259 again:
5260 cur = btrfs_read_lock_root_node(root);
5261 level = btrfs_header_level(cur);
5262 WARN_ON(path->nodes[level]);
5263 path->nodes[level] = cur;
5264 path->locks[level] = BTRFS_READ_LOCK;
5265
5266 if (btrfs_header_generation(cur) < min_trans) {
5267 ret = 1;
5268 goto out;
5269 }
5270 while (1) {
5271 nritems = btrfs_header_nritems(cur);
5272 level = btrfs_header_level(cur);
5273 sret = btrfs_bin_search(cur, min_key, level, &slot);
5274
5275 /* at the lowest level, we're done, setup the path and exit */
5276 if (level == path->lowest_level) {
5277 if (slot >= nritems)
5278 goto find_next_key;
5279 ret = 0;
5280 path->slots[level] = slot;
5281 btrfs_item_key_to_cpu(cur, &found_key, slot);
5282 goto out;
5283 }
5284 if (sret && slot > 0)
5285 slot--;
5286 /*
5287 * check this node pointer against the min_trans parameter.
5288 * If it is too old, skip to the next one.
5289 */
5290 while (slot < nritems) {
5291 u64 gen;
5292
5293 gen = btrfs_node_ptr_generation(cur, slot);
5294 if (gen < min_trans) {
5295 slot++;
5296 continue;
5297 }
5298 break;
5299 }
5300 find_next_key:
5301 /*
5302 * we didn't find a candidate key in this node, walk forward
5303 * and find another one
5304 */
5305 if (slot >= nritems) {
5306 path->slots[level] = slot;
5307 btrfs_set_path_blocking(path);
5308 sret = btrfs_find_next_key(root, path, min_key, level,
5309 min_trans);
5310 if (sret == 0) {
5311 btrfs_release_path(path);
5312 goto again;
5313 } else {
5314 goto out;
5315 }
5316 }
5317 /* save our key for returning back */
5318 btrfs_node_key_to_cpu(cur, &found_key, slot);
5319 path->slots[level] = slot;
5320 if (level == path->lowest_level) {
5321 ret = 0;
5322 goto out;
5323 }
5324 btrfs_set_path_blocking(path);
5325 cur = read_node_slot(fs_info, cur, slot);
5326 if (IS_ERR(cur)) {
5327 ret = PTR_ERR(cur);
5328 goto out;
5329 }
5330
5331 btrfs_tree_read_lock(cur);
5332
5333 path->locks[level - 1] = BTRFS_READ_LOCK;
5334 path->nodes[level - 1] = cur;
5335 unlock_up(path, level, 1, 0, NULL);
5336 btrfs_clear_path_blocking(path, NULL, 0);
5337 }
5338 out:
5339 path->keep_locks = keep_locks;
5340 if (ret == 0) {
5341 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5342 btrfs_set_path_blocking(path);
5343 memcpy(min_key, &found_key, sizeof(found_key));
5344 }
5345 return ret;
5346 }
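
/*
 * Caller loop sketch (assumed, for illustration), in the style of the
 * defrag and tree-logging scans:
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process_slot(path);	   (hypothetical helper)
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;	   (advance past the found key)
 *	}
 *
 * min_key is rewritten with the found key on success, so bumping it is
 * what prevents revisiting the same item.
 */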
5347
5348 static int tree_move_down(struct btrfs_fs_info *fs_info,
5349 struct btrfs_path *path,
5350 int *level)
5351 {
5352 struct extent_buffer *eb;
5353
5354 BUG_ON(*level == 0);
5355 eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5356 if (IS_ERR(eb))
5357 return PTR_ERR(eb);
5358
5359 path->nodes[*level - 1] = eb;
5360 path->slots[*level - 1] = 0;
5361 (*level)--;
5362 return 0;
5363 }
5364
5365 static int tree_move_next_or_upnext(struct btrfs_path *path,
5366 int *level, int root_level)
5367 {
5368 int ret = 0;
5369 int nritems;
5370 nritems = btrfs_header_nritems(path->nodes[*level]);
5371
5372 path->slots[*level]++;
5373
5374 while (path->slots[*level] >= nritems) {
5375 if (*level == root_level)
5376 return -1;
5377
5378 /* move upnext */
5379 path->slots[*level] = 0;
5380 free_extent_buffer(path->nodes[*level]);
5381 path->nodes[*level] = NULL;
5382 (*level)++;
5383 path->slots[*level]++;
5384
5385 nritems = btrfs_header_nritems(path->nodes[*level]);
5386 ret = 1;
5387 }
5388 return ret;
5389 }
5390
5391 /*
5392 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5393 * or down.
5394 */
5395 static int tree_advance(struct btrfs_fs_info *fs_info,
5396 struct btrfs_path *path,
5397 int *level, int root_level,
5398 int allow_down,
5399 struct btrfs_key *key)
5400 {
5401 int ret;
5402
5403 if (*level == 0 || !allow_down) {
5404 ret = tree_move_next_or_upnext(path, level, root_level);
5405 } else {
5406 ret = tree_move_down(fs_info, path, level);
5407 }
5408 if (ret >= 0) {
5409 if (*level == 0)
5410 btrfs_item_key_to_cpu(path->nodes[*level], key,
5411 path->slots[*level]);
5412 else
5413 btrfs_node_key_to_cpu(path->nodes[*level], key,
5414 path->slots[*level]);
5415 }
5416 return ret;
5417 }
5418
5419 static int tree_compare_item(struct btrfs_path *left_path,
5420 struct btrfs_path *right_path,
5421 char *tmp_buf)
5422 {
5423 int cmp;
5424 int len1, len2;
5425 unsigned long off1, off2;
5426
5427 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5428 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5429 if (len1 != len2)
5430 return 1;
5431
5432 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5433 off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5434 right_path->slots[0]);
5435
5436 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5437
5438 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5439 if (cmp)
5440 return 1;
5441 return 0;
5442 }
5443
5444 #define ADVANCE 1
5445 #define ADVANCE_ONLY_NEXT -1
5446
5447 /*
5448 * This function compares two trees and calls the provided callback for
5449 * every changed/new/deleted item it finds.
5450 * If shared tree blocks are encountered, whole subtrees are skipped, making
5451 * the compare pretty fast on snapshotted subvolumes.
5452 *
5453 * This currently works on commit roots only. As commit roots are read only,
5454 * we don't do any locking. The commit roots are protected with transactions.
5455 * Transactions are ended and rejoined when a commit is tried in between.
5456 *
5457 * This function checks for modifications done to the trees while comparing.
5458 * If it detects a change, it aborts immediately.
5459 */
5460 int btrfs_compare_trees(struct btrfs_root *left_root,
5461 struct btrfs_root *right_root,
5462 btrfs_changed_cb_t changed_cb, void *ctx)
5463 {
5464 struct btrfs_fs_info *fs_info = left_root->fs_info;
5465 int ret;
5466 int cmp;
5467 struct btrfs_path *left_path = NULL;
5468 struct btrfs_path *right_path = NULL;
5469 struct btrfs_key left_key;
5470 struct btrfs_key right_key;
5471 char *tmp_buf = NULL;
5472 int left_root_level;
5473 int right_root_level;
5474 int left_level;
5475 int right_level;
5476 int left_end_reached;
5477 int right_end_reached;
5478 int advance_left;
5479 int advance_right;
5480 u64 left_blockptr;
5481 u64 right_blockptr;
5482 u64 left_gen;
5483 u64 right_gen;
5484
5485 left_path = btrfs_alloc_path();
5486 if (!left_path) {
5487 ret = -ENOMEM;
5488 goto out;
5489 }
5490 right_path = btrfs_alloc_path();
5491 if (!right_path) {
5492 ret = -ENOMEM;
5493 goto out;
5494 }
5495
5496 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5497 if (!tmp_buf) {
5498 ret = -ENOMEM;
5499 goto out;
5500 }
5501
5502 left_path->search_commit_root = 1;
5503 left_path->skip_locking = 1;
5504 right_path->search_commit_root = 1;
5505 right_path->skip_locking = 1;
5506
	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leaves
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leaves
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing them
	 *         to go deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we find a level
	 *   where we can go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		up_read(&fs_info->commit_root_sem);
		ret = -ENOMEM;
		goto out;
	}
	extent_buffer_get(left_path->nodes[left_level]);

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		up_read(&fs_info->commit_root_sem);
		ret = -ENOMEM;
		goto out;
	}
	extent_buffer_get(right_path->nodes[right_level]);
	up_read(&fs_info->commit_root_sem);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	left_end_reached = right_end_reached = 0;
	advance_left = advance_right = 0;

	while (1) {
		cond_resched();
		if (advance_left && !left_end_reached) {
			ret = tree_advance(fs_info, left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(fs_info, right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out;
		} else if (left_end_reached) {
			if (right_level == 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow going deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}

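/*
 * Illustrative only (not part of the original source): a minimal sketch of
 * driving btrfs_compare_trees(). The callback, the counter and the wrapper
 * below are hypothetical stand-ins; the real consumer of this API is the
 * send ioctl.
 */
static int example_changed_cb(struct btrfs_path *left_path,
			      struct btrfs_path *right_path,
			      struct btrfs_key *key,
			      enum btrfs_compare_tree_result result,
			      void *ctx)
{
	u64 *count = ctx;	/* hypothetical counter passed through ctx */

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:		/* key only in the left tree */
	case BTRFS_COMPARE_TREE_DELETED:	/* key only in the right tree */
	case BTRFS_COMPARE_TREE_CHANGED:	/* same key, different item data */
		(*count)++;
		break;
	case BTRFS_COMPARE_TREE_SAME:		/* deep compare found no change */
		break;
	}
	return 0;	/* returning < 0 aborts the whole compare */
}

/* Hypothetical wrapper: count all differences between two commit roots. */
static int __maybe_unused example_count_changes(struct btrfs_root *left_root,
						struct btrfs_root *right_root,
						u64 *count)
{
	*count = 0;
	return btrfs_compare_trees(left_root, right_root, example_changed_cb,
				   count);
}
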
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}

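/*
 * Illustrative only (not part of the original source): a hypothetical
 * caller of btrfs_find_next_key(), showing the keep_locks requirement
 * that the WARN_ON above enforces. The helper name and the level 0 /
 * min_trans choices are made up for the sketch.
 */
static int __maybe_unused example_peek_next_key(struct btrfs_root *root,
						const struct btrfs_key *start,
						struct btrfs_key *next,
						u64 min_trans)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->keep_locks = 1;	/* must be set on the search made before */
	ret = btrfs_search_slot(NULL, root, start, path, 0, 0);
	if (ret >= 0)
		ret = btrfs_find_next_key(root, path, next, 0, min_trans);

	btrfs_free_path(path);
	return ret;	/* 0: *next is valid, 1: no higher key, < 0: error */
}
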
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	return btrfs_next_old_leaf(root, path, 0);
}
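
/*
 * Illustrative only (not part of the original source): the canonical
 * forward-iteration pattern built on btrfs_next_leaf(). The helper name
 * and starting key are hypothetical; real callers process each item
 * where the placeholder comment sits.
 */
static int __maybe_unused example_walk_forward(struct btrfs_root *root,
					       const struct btrfs_key *first)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			/* walked off this leaf, move to the next one */
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		/* ... process the item at (leaf, path->slots[0]) ... */
		path->slots[0]++;
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}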
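/*
 * Like btrfs_next_leaf(), but when time_seq is non-zero the search
 * replays the tree state at that point via the tree mod log
 * (btrfs_search_old_slot); time_seq == 0 means the current tree.
 */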
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks. A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block. So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * The check above misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between
	 *   leaves gets another one with a bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (especially in __btrfs_drop_extents()).
	 *
	 * A bit more explanation about this check: with ret > 0 the key was
	 * not found, so the path points to the slot where it would be
	 * inserted, which means the item at path->slots[0] must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, which holds that lock
				 * while itself waiting for the leaf we have
				 * currently locked. To solve this situation,
				 * we give up our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

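/*
 * Illustrative only (not part of the original source): the common
 * "search past the end, then step back" pattern around
 * btrfs_previous_item(). The helper name and the objectid/type a caller
 * would pass are hypothetical.
 */
static int __maybe_unused example_find_last_of_type(struct btrfs_root *root,
						    struct btrfs_path *path,
						    u64 objectid, int type)
{
	struct btrfs_key key;
	int ret;

	key.objectid = objectid;
	key.type = type;
	key.offset = (u64)-1;	/* aim one past any existing offset */

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret > 0)	/* not found: path points after it, step back */
		ret = btrfs_previous_item(root, path, objectid, type);
	return ret;	/* 0: found, 1: nothing before, < 0: error */
}
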
/*
 * search in the extent tree to find a previous Metadata/Data extent item
 * with a minimum objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			       struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
