/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License version 2.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/prefetch.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/random.h>

#include "gfs2.h"
#include "incore.h"
#include "glock.h"
#include "glops.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "super.h"
#include "trans.h"
#include "util.h"
#include "log.h"
#include "inode.h"
#include "trace_gfs2.h"
#include "dir.h"

#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)

#if BITS_PER_LONG == 32
#define LBITMASK   (0x55555555UL)
#define LBITSKIP55 (0x55555555UL)
#define LBITSKIP00 (0x00000000UL)
#else
#define LBITMASK   (0x5555555555555555UL)
#define LBITSKIP55 (0x5555555555555555UL)
#define LBITSKIP00 (0x0000000000000000UL)
#endif

/*
 * These routines are used by the resource group routines (rgrp.c)
 * to keep track of block allocation.  Each block is represented by two
 * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
 *
 * 0 = Free
 * 1 = Used (not metadata)
 * 2 = Unlinked (still in use) inode
 * 3 = Used (metadata)
 */
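
/*
 * For illustration (not part of the on-disk format description): block n
 * within a bitmap byte occupies bits 2n and 2n+1, with block 0 in the two
 * least significant bits.  So the byte 0xE4 (binary 11 10 01 00) describes
 * four blocks: block 0 free (00), block 1 used data (01), block 2 an
 * unlinked inode (10), and block 3 used metadata (11).
 */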

struct gfs2_extent {
	struct gfs2_rbm rbm;
	u32 len;
};

static const char valid_change[16] = {
	        /* current */
	/* n */ 0, 1, 1, 1,
	/* e */ 1, 0, 0, 0,
	/* w */ 0, 0, 0, 1,
	        1, 0, 0, 0
};
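
/*
 * An illustrative note: valid_change is indexed as
 * valid_change[new_state * 4 + cur_state] (see gfs2_setbit() below), so each
 * row is a new state and each column a current state.  For example, freeing
 * a data block (new_state = GFS2_BLKST_FREE, cur_state = GFS2_BLKST_USED)
 * reads entry 0 * 4 + 1 = 1, which is a valid transition, while "freeing" an
 * already free block reads entry 0 and is rejected as an inconsistency.
 */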

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap);


/**
 * gfs2_setbit - Set a bit in the bitmaps
 * @rbm: The position of the bit to set
 * @do_clone: Also set the clone bitmap, if it exists
 * @new_state: the new state of the block
 *
 */

static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
			       unsigned char new_state)
{
	unsigned char *byte1, *byte2, *end, cur_state;
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	unsigned int buflen = bi->bi_len;
	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
	end = bi->bi_bh->b_data + bi->bi_offset + buflen;

	BUG_ON(byte1 >= end);

	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;

	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
		pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
			rbm->offset, cur_state, new_state);
		pr_warn("rgrp=0x%llx bi_start=0x%x\n",
			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
		pr_warn("bi_offset=0x%x bi_len=0x%x\n",
			bi->bi_offset, bi->bi_len);
		dump_stack();
		gfs2_consist_rgrpd(rbm->rgd);
		return;
	}
	*byte1 ^= (cur_state ^ new_state) << bit;

	if (do_clone && bi->bi_clone) {
		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
		*byte2 ^= (cur_state ^ new_state) << bit;
	}
}

/**
 * gfs2_testbit - test a bit in the bitmaps
 * @rbm: The bit to test
 * @use_clone: If true, test the clone bitmap, not the official bitmap.
 *
 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
 * not the "real" bitmaps, to avoid allocating recently freed blocks.
 *
 * Returns: The two bit block state of the requested bit
 */

static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
{
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	const u8 *buffer;
	const u8 *byte;
	unsigned int bit;

	if (use_clone && bi->bi_clone)
		buffer = bi->bi_clone;
	else
		buffer = bi->bi_bh->b_data;
	buffer += bi->bi_offset;
	byte = buffer + (rbm->offset / GFS2_NBBY);
	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;
}

/**
 * gfs2_bit_search
 * @ptr: Pointer to bitmap data
 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
 * @state: The state we are searching for
 *
 * We xor the bitmap data with a pattern which is the bitwise opposite
 * of what we are looking for; this gives rise to a pattern of ones
 * wherever there is a match. Since we have two bits per entry, we
 * take this pattern, shift it down by one place and then AND it with
 * the original. All the even bit positions (0,2,4, etc) then represent
 * successful matches, so we mask with 0x55555..... to remove the unwanted
 * odd bit positions.
 *
 * This allows searching of a whole u64 at once (32 blocks) with a
 * single test (on 64 bit arches).
 */

static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
{
	u64 tmp;
	static const u64 search[] = {
		[0] = 0xffffffffffffffffULL,
		[1] = 0xaaaaaaaaaaaaaaaaULL,
		[2] = 0x5555555555555555ULL,
		[3] = 0x0000000000000000ULL,
	};
	tmp = le64_to_cpu(*ptr) ^ search[state];
	tmp &= (tmp >> 1);
	tmp &= mask;
	return tmp;
}
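
/*
 * A worked example of the trick above (illustrative only): searching the
 * word 0x0000000000000004 for GFS2_BLKST_USED (01) entries, the xor with
 * search[1] = 0xaaaaaaaaaaaaaaaa turns the single 01 entry at bits 2-3
 * into 11 and every 00 entry into 10.  tmp & (tmp >> 1) then leaves a 1
 * only at bit 2, the 0x555... mask keeps it, and __ffs64() reports bit 2,
 * i.e. block 1 of that word.
 */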

/**
 * rs_cmp - multi-block reservation range compare
 * @blk: absolute file system block number of the new reservation
 * @len: number of blocks in the new reservation
 * @rs: existing reservation to compare against
 *
 * returns: 1 if the block range is beyond the reach of the reservation
 *         -1 if the block range is before the start of the reservation
 *          0 if the block range overlaps with the reservation
 */
static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
{
	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);

	if (blk >= startblk + rs->rs_free)
		return 1;
	if (blk + len - 1 < startblk)
		return -1;
	return 0;
}
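
/*
 * For example (illustrative numbers): with an existing reservation starting
 * at block 1000 with rs_free = 20 (covering blocks 1000-1019),
 * rs_cmp(1030, 10, rs) returns 1 (entirely after), rs_cmp(990, 5, rs)
 * returns -1 (entirely before), and rs_cmp(995, 10, rs) returns 0 because
 * blocks 1000-1004 overlap the reservation.
 */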

/**
 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
 *       a block in a given allocation state.
 * @buf: the buffer that holds the bitmaps
 * @len: the length (in bytes) of the buffer
 * @goal: start search at this block's bit-pair (within @buffer)
 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
 *
 * Scope of @goal and returned block number is only within this bitmap buffer,
 * not entire rgrp or filesystem.  @buffer will be offset from the actual
 * beginning of a bitmap block buffer, skipping any header structures, but
 * headers are always a multiple of 64 bits long so that the buffer is
 * always aligned to a 64 bit boundary.
 *
 * The size of the buffer is in bytes, but it is assumed that it is
 * always ok to read a complete multiple of 64 bits at the end
 * of the block in case the end is not aligned to a natural boundary.
 *
 * Return: the block number (bitmap buffer scope) that was found
 */

static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
		       u32 goal, u8 state)
{
	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
	u64 tmp;
	u64 mask = 0x5555555555555555ULL;
	u32 bit;

	/* Mask off bits we don't care about at the start of the search */
	mask <<= spoint;
	tmp = gfs2_bit_search(ptr, mask, state);
	ptr++;
	while (tmp == 0 && ptr < end) {
		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
		ptr++;
	}
	/* Mask off any bits which are more than len bytes from the start */
	if (ptr == end && (len & (sizeof(u64) - 1)))
		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
	/* Didn't find anything, so return */
	if (tmp == 0)
		return BFITNOENT;
	ptr--;
	bit = __ffs64(tmp);
	bit /= 2;	/* two bits per entry in the bitmap */
	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
}
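
/*
 * Usage sketch (illustrative; "buffer" is a hypothetical pointer just past
 * the header of a bitmap block): to find the first free block at or after
 * block 17 of a 64-byte bitmap data area, a caller would do something like
 *
 *	u32 blk = gfs2_bitfit(buffer, 64, 17, GFS2_BLKST_FREE);
 *	if (blk == BFITNOENT)
 *		;	// no matching block in this bitmap
 *
 * where blk, like the goal, is relative to the start of this bitmap buffer,
 * not to the rgrp or the filesystem.
 */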

/**
 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
 * @rbm: The rbm with rgd already set correctly
 * @block: The block number (filesystem relative)
 *
 * This sets the bi and offset members of an rbm based on a
 * resource group and a filesystem relative block number. The
 * resource group must be set in the rbm on entry, the bi and
 * offset members will be set by this function.
 *
 * Returns: 0 on success, or an error code
 */

static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
{
	u64 rblock = block - rbm->rgd->rd_data0;

	if (WARN_ON_ONCE(rblock > UINT_MAX))
		return -EINVAL;
	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
		return -E2BIG;

	rbm->bii = 0;
	rbm->offset = (u32)(rblock);
	/* Check if the block is within the first block */
	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
		return 0;

	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
	rbm->offset += (sizeof(struct gfs2_rgrp) -
			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	return 0;
}
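
/*
 * To illustrate the adjustment above with made-up numbers: suppose the
 * header block holds bi_blocks = 1000 entries and a full bitmap block
 * (sd_blocks_per_bitmap) holds 1016, the difference of 16 being the extra
 * room the larger gfs2_rgrp header consumes.  A relative block of 2020 is
 * first biased by 16 to 2036, as if every block carried only a
 * gfs2_meta_header, and then 2036 / 1016 = 2 gives bii = 2 with
 * offset = 2036 - 2 * 1016 = 4.
 */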

/**
 * gfs2_rbm_incr - increment an rbm structure
 * @rbm: The rbm with rgd already set correctly
 *
 * This function takes an existing rbm structure and increments it to the next
 * viable block offset.
 *
 * Returns: If incrementing the offset would cause the rbm to go past the
 *          end of the rgrp, true is returned, otherwise false.
 *
 */

static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
{
	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
		rbm->offset++;
		return false;
	}
	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
		return true;

	rbm->offset = 0;
	rbm->bii++;
	return false;
}

/**
 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
 * @rbm: Position to search (value/result)
 * @n_unaligned: Number of unaligned blocks to check
 * @len: Decremented for each block found (terminate on zero)
 *
 * Returns: true if a non-free block is encountered
 */

static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
{
	u32 n;
	u8 res;

	for (n = 0; n < n_unaligned; n++) {
		res = gfs2_testbit(rbm, true);
		if (res != GFS2_BLKST_FREE)
			return true;
		(*len)--;
		if (*len == 0)
			return true;
		if (gfs2_rbm_incr(rbm))
			return true;
	}

	return false;
}

/**
 * gfs2_free_extlen - Return extent length of free blocks
 * @rrbm: Starting position
 * @len: Max length to check
 *
 * Starting at the block specified by the rbm, see how many free blocks
 * there are, not reading more than len blocks ahead. This can be done
 * using memchr_inv when the blocks are byte aligned, but has to be done
 * on a block by block basis in case of unaligned blocks. Also this
 * function can cope with bitmap boundaries (although it must stop on
 * a resource group boundary).
 *
 * Returns: Number of free blocks in the extent
 */

static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
{
	struct gfs2_rbm rbm = *rrbm;
	u32 n_unaligned = rbm.offset & 3;
	u32 size = len;
	u32 bytes;
	u32 chunk_size;
	u8 *ptr, *start, *end;
	u64 block;
	struct gfs2_bitmap *bi;

	if (n_unaligned &&
	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
		goto out;

	n_unaligned = len & 3;
	/* Start is now byte aligned */
	while (len > 3) {
		bi = rbm_bi(&rbm);
		start = bi->bi_bh->b_data;
		if (bi->bi_clone)
			start = bi->bi_clone;
		start += bi->bi_offset;
		end = start + bi->bi_len;
		BUG_ON(rbm.offset & 3);
		start += (rbm.offset / GFS2_NBBY);
		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
		ptr = memchr_inv(start, 0, bytes);
		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
		chunk_size *= GFS2_NBBY;
		BUG_ON(len < chunk_size);
		len -= chunk_size;
		block = gfs2_rbm_to_block(&rbm);
		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
			n_unaligned = 0;
			break;
		}
		if (ptr) {
			n_unaligned = 3;
			break;
		}
		n_unaligned = len & 3;
	}

	/* Deal with any bits left over at the end */
	if (n_unaligned)
		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
out:
	return size - len;
}

/**
 * gfs2_bitcount - count the number of bits in a certain state
 * @rgd: the resource group descriptor
 * @buffer: the buffer that holds the bitmaps
 * @buflen: the length (in bytes) of the buffer
 * @state: the state of the block we're looking for
 *
 * Returns: The number of bits
 */

static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
			 unsigned int buflen, u8 state)
{
	const u8 *byte = buffer;
	const u8 *end = buffer + buflen;
	const u8 state1 = state << 2;
	const u8 state2 = state << 4;
	const u8 state3 = state << 6;
	u32 count = 0;

	for (; byte < end; byte++) {
		if (((*byte) & 0x03) == state)
			count++;
		if (((*byte) & 0x0C) == state1)
			count++;
		if (((*byte) & 0x30) == state2)
			count++;
		if (((*byte) & 0xC0) == state3)
			count++;
	}

	return count;
}
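
/*
 * By way of example (illustrative): counting GFS2_BLKST_USED (01) entries
 * in the single byte 0x45 (binary 01 00 01 01) compares the four two-bit
 * fields against 0x01, 0x04, 0x10 and 0x40 in turn; fields 0, 1 and 3
 * match, so the count for that byte is 3.
 */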

/**
 * gfs2_rgrp_verify - Verify that a resource group is consistent
 * @rgd: the rgrp
 *
 */

void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi = NULL;
	u32 length = rgd->rd_length;
	u32 count[4], tmp;
	int buf, x;

	memset(count, 0, 4 * sizeof(u32));

	/* Count # blocks in each of 4 possible allocation states */
	for (buf = 0; buf < length; buf++) {
		bi = rgd->rd_bits + buf;
		for (x = 0; x < 4; x++)
			count[x] += gfs2_bitcount(rgd,
						  bi->bi_bh->b_data +
						  bi->bi_offset,
						  bi->bi_len, x);
	}

	if (count[0] != rgd->rd_free) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "free data mismatch:  %u != %u\n",
			       count[0], rgd->rd_free);
		return;
	}

	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
	if (count[1] != tmp) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "used data mismatch:  %u != %u\n",
			       count[1], tmp);
		return;
	}

	if (count[2] + count[3] != rgd->rd_dinodes) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
			       count[2] + count[3], rgd->rd_dinodes);
		return;
	}
}

/**
 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
 * @sdp: The GFS2 superblock
 * @blk: The data block number
 * @exact: True if this needs to be an exact match
 *
 * The @exact argument should be set to true by most callers. The exception
 * is when we need to match blocks which are not represented by the rgrp
 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
 * there for alignment purposes. Another way of looking at it is that @exact
 * matches only valid data/metadata blocks, but with @exact false, it will
 * match any block within the extent of the rgrp.
 *
 * Returns: The resource group, or NULL if not found
 */

struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
{
	struct rb_node *n, *next;
	struct gfs2_rgrpd *cur;

	spin_lock(&sdp->sd_rindex_spin);
	n = sdp->sd_rindex_tree.rb_node;
	while (n) {
		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
		next = NULL;
		if (blk < cur->rd_addr)
			next = n->rb_left;
		else if (blk >= cur->rd_data0 + cur->rd_data)
			next = n->rb_right;
		if (next == NULL) {
			spin_unlock(&sdp->sd_rindex_spin);
			if (exact) {
				if (blk < cur->rd_addr)
					return NULL;
				if (blk >= cur->rd_data0 + cur->rd_data)
					return NULL;
			}
			return cur;
		}
		n = next;
	}
	spin_unlock(&sdp->sd_rindex_spin);

	return NULL;
}

/**
 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
 * @sdp: The GFS2 superblock
 *
 * Returns: The first rgrp in the filesystem
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
{
	const struct rb_node *n;
	struct gfs2_rgrpd *rgd;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_first(&sdp->sd_rindex_tree);
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}

/**
 * gfs2_rgrpd_get_next - get the next RG
 * @rgd: the resource group descriptor
 *
 * Returns: The next rgrp
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	const struct rb_node *n;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_next(&rgd->rd_node);
	if (n == NULL)
		n = rb_first(&sdp->sd_rindex_tree);

	if (unlikely(&rgd->rd_node == n)) {
		spin_unlock(&sdp->sd_rindex_spin);
		return NULL;
	}
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);
	return rgd;
}

void check_and_update_goal(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
		ip->i_goal = ip->i_no_addr;
}

void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
	int x;

	for (x = 0; x < rgd->rd_length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		kfree(bi->bi_clone);
		bi->bi_clone = NULL;
	}
}
/**
 * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
 *                 plus a quota allocation data structure, if necessary
 * @ip: the inode for this reservation
 */
int gfs2_rsqa_alloc(struct gfs2_inode *ip)
{
	return gfs2_qa_alloc(ip);
}

static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
{
	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);

	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
		       (unsigned long long)ip->i_no_addr,
		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
		       rs->rs_rbm.offset, rs->rs_free);
}

/**
 * __rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
static void __rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	if (!gfs2_rs_active(rs))
		return;

	rgd = rs->rs_rbm.rgd;
	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
	rb_erase(&rs->rs_node, &rgd->rd_rstree);
	RB_CLEAR_NODE(&rs->rs_node);

	if (rs->rs_free) {
		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
				 rs->rs_free - 1;
		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
		struct gfs2_bitmap *start, *last;

		/* return reserved blocks to the rgrp */
		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
		/* The rgrp extent failure point is likely not to increase;
		   it will only do so if the freed blocks are somehow
		   contiguous with a span of free blocks that follows. Still,
		   it will force the number to be recalculated later. */
		rgd->rd_extfail_pt += rs->rs_free;
		rs->rs_free = 0;
		if (gfs2_rbm_from_block(&last_rbm, last_block))
			return;
		start = rbm_bi(&rs->rs_rbm);
		last = rbm_bi(&last_rbm);
		do
			clear_bit(GBF_FULL, &start->bi_flags);
		while (start++ != last);
	}
}

/**
 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	rgd = rs->rs_rbm.rgd;
	if (rgd) {
		spin_lock(&rgd->rd_rsspin);
		__rs_deltree(rs);
		BUG_ON(rs->rs_free);
		spin_unlock(&rgd->rd_rsspin);
	}
}

/**
 * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
 * @ip: The inode for this reservation
 * @wcount: The inode's write count, or NULL
 *
 */
void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
	down_write(&ip->i_rw_mutex);
	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
		gfs2_rs_deltree(&ip->i_res);
	up_write(&ip->i_rw_mutex);
	gfs2_qa_delete(ip, wcount);
}

/**
 * return_all_reservations - return all reserved blocks back to the rgrp.
 * @rgd: the rgrp that needs its space back
 *
 * We previously reserved a bunch of blocks for allocation. Now we need to
 * give them back. This leaves the reservation structures intact, but removes
 * all of their corresponding "no-fly zones".
 */
static void return_all_reservations(struct gfs2_rgrpd *rgd)
{
	struct rb_node *n;
	struct gfs2_blkreserv *rs;

	spin_lock(&rgd->rd_rsspin);
	while ((n = rb_first(&rgd->rd_rstree))) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		__rs_deltree(rs);
	}
	spin_unlock(&rgd->rd_rsspin);
}

void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
	struct rb_node *n;
	struct gfs2_rgrpd *rgd;
	struct gfs2_glock *gl;

	while ((n = rb_first(&sdp->sd_rindex_tree))) {
		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
		gl = rgd->rd_gl;

		rb_erase(n, &sdp->sd_rindex_tree);

		if (gl) {
			glock_clear_object(gl, rgd);
			gfs2_rgrp_brelse(rgd);
			gfs2_glock_put(gl);
		}

		gfs2_free_clones(rgd);
		return_all_reservations(rgd);
		kfree(rgd->rd_bits);
		rgd->rd_bits = NULL;
		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	}
}

static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
{
	pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
	pr_info("ri_length = %u\n", rgd->rd_length);
	pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
	pr_info("ri_data = %u\n", rgd->rd_data);
	pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
}

/**
 * compute_bitstructs - Compute the bitmap sizes
 * @rgd: The resource group descriptor
 *
 * Calculates bitmap descriptors, one for each block that contains bitmap data
 *
 * Returns: errno
 */

static int compute_bitstructs(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi;
	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
	u32 bytes_left, bytes;
	int x;

	if (!length)
		return -EINVAL;

	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
	if (!rgd->rd_bits)
		return -ENOMEM;

	bytes_left = rgd->rd_bitbytes;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;

		bi->bi_flags = 0;
		/* small rgrp; bitmap stored completely in header block */
		if (length == 1) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_len = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* header block */
		} else if (x == 0) {
			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_len = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* last block */
		} else if (x + 1 == length) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_len = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* other blocks */
		} else {
			bytes = sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_meta_header);
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_len = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		}

		bytes_left -= bytes;
	}

	if (bytes_left) {
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}
	bi = rgd->rd_bits + (length - 1);
	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
		if (gfs2_consist_rgrpd(rgd)) {
			gfs2_rindex_print(rgd);
			fs_err(sdp, "start=%u len=%u offset=%u\n",
			       bi->bi_start, bi->bi_len, bi->bi_offset);
		}
		return -EIO;
	}

	return 0;
}

/**
 * gfs2_ri_total - Total up the file system space, according to the rindex.
 * @sdp: the filesystem
 *
 */
u64 gfs2_ri_total(struct gfs2_sbd *sdp)
{
	u64 total_data = 0;
	struct inode *inode = sdp->sd_rindex;
	struct gfs2_inode *ip = GFS2_I(inode);
	char buf[sizeof(struct gfs2_rindex)];
	int error, rgrps;

	for (rgrps = 0;; rgrps++) {
		loff_t pos = rgrps * sizeof(struct gfs2_rindex);

		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
			break;
		error = gfs2_internal_read(ip, buf, &pos,
					   sizeof(struct gfs2_rindex));
		if (error != sizeof(struct gfs2_rindex))
			break;
		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
	}
	return total_data;
}

static int rgd_insert(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;

	/* Figure out where to put new node */
	while (*newn) {
		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
						  rd_node);

		parent = *newn;
		if (rgd->rd_addr < cur->rd_addr)
			newn = &((*newn)->rb_left);
		else if (rgd->rd_addr > cur->rd_addr)
			newn = &((*newn)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&rgd->rd_node, parent, newn);
	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
	sdp->sd_rgrps++;
	return 0;
}

/**
 * read_rindex_entry - Pull in a new resource index entry from the disk
 * @ip: Pointer to the rindex inode
 *
 * Returns: 0 on success, > 0 on EOF, error code otherwise
 */

static int read_rindex_entry(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	const unsigned bsize = sdp->sd_sb.sb_bsize;
	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
	struct gfs2_rindex buf;
	int error;
	struct gfs2_rgrpd *rgd;

	if (pos >= i_size_read(&ip->i_inode))
		return 1;

	error = gfs2_internal_read(ip, (char *)&buf, &pos,
				   sizeof(struct gfs2_rindex));

	if (error != sizeof(struct gfs2_rindex))
		return (error == 0) ? 1 : error;

	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
	error = -ENOMEM;
	if (!rgd)
		return error;

	rgd->rd_sbd = sdp;
	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
	rgd->rd_length = be32_to_cpu(buf.ri_length);
	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
	rgd->rd_data = be32_to_cpu(buf.ri_data);
	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
	spin_lock_init(&rgd->rd_rsspin);

	error = gfs2_glock_get(sdp, rgd->rd_addr,
			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
	if (error)
		goto fail;

	error = compute_bitstructs(rgd);
	if (error)
		goto fail_glock;

	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
	if (rgd->rd_data > sdp->sd_max_rg_data)
		sdp->sd_max_rg_data = rgd->rd_data;
	spin_lock(&sdp->sd_rindex_spin);
	error = rgd_insert(rgd);
	spin_unlock(&sdp->sd_rindex_spin);
	if (!error) {
		glock_set_object(rgd->rd_gl, rgd);
		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
						    rgd->rd_length) * bsize) - 1;
		return 0;
	}

	error = 0; /* someone else read in the rgrp; free it and ignore it */
fail_glock:
	gfs2_glock_put(rgd->rd_gl);

fail:
	kfree(rgd->rd_bits);
	rgd->rd_bits = NULL;
	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	return error;
}

/**
 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
 * @sdp: the GFS2 superblock
 *
 * The purpose of this function is to select a subset of the resource groups
 * and mark them as PREFERRED. We do it in such a way that each node prefers
 * to use a unique set of rgrps to minimize glock contention.
 */
static void set_rgrp_preferences(struct gfs2_sbd *sdp)
{
	struct gfs2_rgrpd *rgd, *first;
	int i;

	/* Skip an initial number of rgrps, based on this node's journal ID.
	   That should start each node out on its own set. */
	rgd = gfs2_rgrpd_get_first(sdp);
	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
		rgd = gfs2_rgrpd_get_next(rgd);
	first = rgd;

	do {
		rgd->rd_flags |= GFS2_RDF_PREFERRED;
		for (i = 0; i < sdp->sd_journals; i++) {
			rgd = gfs2_rgrpd_get_next(rgd);
			if (!rgd || rgd == first)
				break;
		}
	} while (rgd && rgd != first);
}
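
/*
 * For instance (illustrative numbers): in a three-journal cluster, the node
 * with journal ID 1 starts at the second rgrp and then marks every third
 * rgrp as PREFERRED (rgrps 1, 4, 7, ...), while journal 0 prefers rgrps
 * 0, 3, 6, ... and journal 2 prefers rgrps 2, 5, 8, ..., so each node
 * gravitates towards its own stripe of the filesystem.
 */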

/**
 * gfs2_ri_update - Pull in a new resource index from the disk
 * @ip: pointer to the rindex inode
 *
 * Returns: 0 on successful update, error code otherwise
 */

static int gfs2_ri_update(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	do {
		error = read_rindex_entry(ip);
	} while (error == 0);

	if (error < 0)
		return error;

	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
		fs_err(sdp, "no resource groups found in the file system.\n");
		return -ENOENT;
	}
	set_rgrp_preferences(sdp);

	sdp->sd_rindex_uptodate = 1;
	return 0;
}

/**
 * gfs2_rindex_update - Update the rindex if required
 * @sdp: The GFS2 superblock
 *
 * We grab a lock on the rindex inode to make sure that it doesn't
 * change whilst we are performing an operation. We keep this lock
 * for quite long periods of time compared to other locks. This
 * doesn't matter, since it is shared and it is very, very rarely
 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
 *
 * This makes sure that we're using the latest copy of the resource index
 * special file, which might have been updated if someone expanded the
 * filesystem (via gfs2_grow utility), which adds new resource groups.
 *
 * Returns: 0 on success, error code otherwise
 */

int gfs2_rindex_update(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
	struct gfs2_glock *gl = ip->i_gl;
	struct gfs2_holder ri_gh;
	int error = 0;
	int unlock_required = 0;

	/* Read new copy from disk if we don't have the latest */
	if (!sdp->sd_rindex_uptodate) {
		if (!gfs2_glock_is_locked_by_me(gl)) {
			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
			if (error)
				return error;
			unlock_required = 1;
		}
		if (!sdp->sd_rindex_uptodate)
			error = gfs2_ri_update(ip);
		if (unlock_required)
			gfs2_glock_dq_uninit(&ri_gh);
	}

	return error;
}

static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
{
	const struct gfs2_rgrp *str = buf;
	u32 rg_flags;

	rg_flags = be32_to_cpu(str->rg_flags);
	rg_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= rg_flags;
	rgd->rd_free = be32_to_cpu(str->rg_free);
	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
}

static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
{
	const struct gfs2_rgrp *str = buf;

	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
	rgl->rl_flags = str->rg_flags;
	rgl->rl_free = str->rg_free;
	rgl->rl_dinodes = str->rg_dinodes;
	rgl->rl_igeneration = str->rg_igeneration;
	rgl->__pad = 0UL;
}

static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
{
	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
	struct gfs2_rgrp *str = buf;
	u32 crc;

	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
	str->rg_free = cpu_to_be32(rgd->rd_free);
	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
	if (next == NULL)
		str->rg_skip = 0;
	else if (next->rd_addr > rgd->rd_addr)
		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
	str->rg_data = cpu_to_be32(rgd->rd_data);
	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
	str->rg_crc = 0;
	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
	str->rg_crc = cpu_to_be32(crc);

	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
}

static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
{
	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;

	if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
	    rgl->rl_dinodes != str->rg_dinodes ||
	    rgl->rl_igeneration != str->rg_igeneration)
		return 0;
	return 1;
}

static u32 count_unlinked(struct gfs2_rgrpd *rgd)
{
	struct gfs2_bitmap *bi;
	const u32 length = rgd->rd_length;
	const u8 *buffer = NULL;
	u32 i, goal, count = 0;

	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
		goal = 0;
		buffer = bi->bi_bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bi->bi_bh));
		while (goal < bi->bi_len * GFS2_NBBY) {
			goal = gfs2_bitfit(buffer, bi->bi_len, goal,
					   GFS2_BLKST_UNLINKED);
			if (goal == BFITNOENT)
				break;
			count++;
			goal++;
		}
	}

	return count;
}


/**
 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
 * @rgd: the struct gfs2_rgrpd describing the RG to read in
 *
 * Read in all of a Resource Group's header and bitmap blocks.
 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
 *
 * Returns: errno
 */

static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl = rgd->rd_gl;
	unsigned int length = rgd->rd_length;
	struct gfs2_bitmap *bi;
	unsigned int x, y;
	int error;

	if (rgd->rd_bits[0].bi_bh != NULL)
		return 0;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;
		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
		if (error)
			goto fail;
	}

	for (y = length; y--;) {
		bi = rgd->rd_bits + y;
		error = gfs2_meta_wait(sdp, bi->bi_bh);
		if (error)
			goto fail;
		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
					      GFS2_METATYPE_RG)) {
			error = -EIO;
			goto fail;
		}
	}

	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
		for (x = 0; x < length; x++)
			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
		rgd->rd_free_clone = rgd->rd_free;
		/* max out the rgrp allocation failure point */
		rgd->rd_extfail_pt = rgd->rd_free;
	}
	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
				     rgd->rd_bits[0].bi_bh->b_data);
	} else if (sdp->sd_args.ar_rgrplvb) {
		if (!gfs2_rgrp_lvb_valid(rgd)) {
			gfs2_consist_rgrpd(rgd);
			error = -EIO;
			goto fail;
		}
		if (rgd->rd_rgl->rl_unlinked == 0)
			rgd->rd_flags &= ~GFS2_RDF_CHECK;
	}
	return 0;

fail:
	while (x--) {
		bi = rgd->rd_bits + x;
		brelse(bi->bi_bh);
		bi->bi_bh = NULL;
		gfs2_assert_warn(sdp, !bi->bi_clone);
	}

	return error;
}

static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
{
	u32 rl_flags;

	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
		return 0;

	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
		return gfs2_rgrp_bh_get(rgd);

	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
	rl_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
	if (rgd->rd_rgl->rl_unlinked == 0)
		rgd->rd_flags &= ~GFS2_RDF_CHECK;
	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
	rgd->rd_free_clone = rgd->rd_free;
	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
	return 0;
}

int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
{
	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
		return 0;
	return gfs2_rgrp_bh_get(rgd);
}

/**
 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
 * @rgd: The resource group
 *
 */

void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
{
	int x, length = rgd->rd_length;

	for (x = 0; x < length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		if (bi->bi_bh) {
			brelse(bi->bi_bh);
			bi->bi_bh = NULL;
		}
	}
}

/**
 * gfs2_rgrp_go_unlock - Unlock a rgrp glock
 * @gh: The glock holder for the resource group
 *
 */

void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
{
	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);

	if (rgd && demote_requested)
		gfs2_rgrp_brelse(rgd);
}

int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
			     struct buffer_head *bh,
			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
{
	struct super_block *sb = sdp->sd_vfs;
	u64 blk;
	sector_t start = 0;
	sector_t nr_blks = 0;
	int rv;
	unsigned int x;
	u32 trimmed = 0;
	u8 diff;

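	/*
	 * A note on the bit trick below (illustrative): for each bitmap byte,
	 * "diff" ends up with a 1 in bit 2n (hence the 0x55 mask) for every
	 * two-bit entry n that is free in @bh's copy of the bitmap but still
	 * allocated in the clone, i.e. a block freed since the clone was
	 * taken.  For example, with *orig = 0x00 (all four blocks now free)
	 * and *clone = 0x01 (block 0 previously in use),
	 * diff = ~(0x00 | 0x00) & (0x01 | 0x00) & 0x55 = 0x01, so only
	 * block 0 is a candidate for discarding.  With no @bh, every
	 * currently free block is a candidate.
	 */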
	for (x = 0; x < bi->bi_len; x++) {
		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
		clone += bi->bi_offset;
		clone += x;
		if (bh) {
			const u8 *orig = bh->b_data + bi->bi_offset + x;
			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
		} else {
			diff = ~(*clone | (*clone >> 1));
		}
		diff &= 0x55;
		if (diff == 0)
			continue;
		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
		while (diff) {
			if (diff & 1) {
				if (nr_blks == 0)
					goto start_new_extent;
				if ((start + nr_blks) != blk) {
					if (nr_blks >= minlen) {
						rv = sb_issue_discard(sb,
							start, nr_blks,
							GFP_NOFS, 0);
						if (rv)
							goto fail;
						trimmed += nr_blks;
					}
					nr_blks = 0;
start_new_extent:
					start = blk;
				}
				nr_blks++;
			}
			diff >>= 2;
			blk++;
		}
	}
	if (nr_blks >= minlen) {
		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
		if (rv)
			goto fail;
		trimmed += nr_blks;
	}
	if (ptrimmed)
		*ptrimmed = trimmed;
	return 0;

fail:
	if (sdp->sd_args.ar_discard)
		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
	sdp->sd_args.ar_discard = 0;
	return -EIO;
}

/**
 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
 * @filp: Any file on the filesystem
 * @argp: Pointer to the arguments (also used to pass result)
 *
 * Returns: 0 on success, otherwise error code
 */

int gfs2_fitrim(struct file *filp, void __user *argp)
{
	struct inode *inode = file_inode(filp);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
	struct buffer_head *bh;
	struct gfs2_rgrpd *rgd;
	struct gfs2_rgrpd *rgd_end;
	struct gfs2_holder gh;
	struct fstrim_range r;
	int ret = 0;
	u64 amt;
	u64 trimmed = 0;
	u64 start, end, minlen;
	unsigned int x;
	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
		return -EROFS;

	if (!blk_queue_discard(q))
		return -EOPNOTSUPP;

	if (copy_from_user(&r, argp, sizeof(r)))
		return -EFAULT;

	ret = gfs2_rindex_update(sdp);
	if (ret)
		return ret;

	start = r.start >> bs_shift;
	end = start + (r.len >> bs_shift);
	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
	minlen = max_t(u64, minlen,
		       q->limits.discard_granularity) >> bs_shift;

	if (end <= start || minlen > sdp->sd_max_rg_data)
		return -EINVAL;

	rgd = gfs2_blk2rgrpd(sdp, start, 0);
	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);

	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
		return -EINVAL; /* start is beyond the end of the fs */

	while (1) {

		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
		if (ret)
			goto out;

		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
			/* Trim each bitmap in the rgrp */
			for (x = 0; x < rgd->rd_length; x++) {
				struct gfs2_bitmap *bi = rgd->rd_bits + x;
				ret = gfs2_rgrp_send_discards(sdp,
						rgd->rd_data0, NULL, bi, minlen,
						&amt);
				if (ret) {
					gfs2_glock_dq_uninit(&gh);
					goto out;
				}
				trimmed += amt;
			}

			/* Mark rgrp as having been trimmed */
			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
			if (ret == 0) {
				bh = rgd->rd_bits[0].bi_bh;
				rgd->rd_flags |= GFS2_RGF_TRIMMED;
				gfs2_trans_add_meta(rgd->rd_gl, bh);
				gfs2_rgrp_out(rgd, bh->b_data);
				gfs2_trans_end(sdp);
			}
		}
		gfs2_glock_dq_uninit(&gh);

		if (rgd == rgd_end)
			break;

		rgd = gfs2_rgrpd_get_next(rgd);
	}

out:
	r.len = trimmed << bs_shift;
	if (copy_to_user(argp, &r, sizeof(r)))
		return -EFAULT;

	return ret;
}

/**
 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
 * @ip: the inode structure
 *
 */
static void rs_insert(struct gfs2_inode *ip)
{
	struct rb_node **newn, *parent = NULL;
	int rc;
	struct gfs2_blkreserv *rs = &ip->i_res;
	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);

	BUG_ON(gfs2_rs_active(rs));

	spin_lock(&rgd->rd_rsspin);
	newn = &rgd->rd_rstree.rb_node;
	while (*newn) {
		struct gfs2_blkreserv *cur =
			rb_entry(*newn, struct gfs2_blkreserv, rs_node);

		parent = *newn;
		rc = rs_cmp(fsblock, rs->rs_free, cur);
		if (rc > 0)
			newn = &((*newn)->rb_right);
		else if (rc < 0)
			newn = &((*newn)->rb_left);
		else {
			spin_unlock(&rgd->rd_rsspin);
			WARN_ON(1);
			return;
		}
	}

	rb_link_node(&rs->rs_node, parent, newn);
	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);

	/* Do our rgrp accounting for the reservation */
	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
	spin_unlock(&rgd->rd_rsspin);
	trace_gfs2_rs(rs, TRACE_RS_INSERT);
}

/**
 * rgd_free - return the number of free blocks we can allocate.
 * @rgd: the resource group
 * @rs: our own block reservation, whose blocks are not counted as reserved
 *
 * This function returns the number of free blocks for an rgrp.
 * That's the clone-free blocks (blocks that are free, not including those
 * still being used for unlinked files that haven't been deleted).
 *
 * It also subtracts any blocks reserved by someone else, but does not
 * include free blocks that are still part of our current reservation,
 * because obviously we can (and will) allocate them.
 */
static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
{
	u32 tot_reserved, tot_free;

	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
		return 0;
	tot_reserved = rgd->rd_reserved - rs->rs_free;

	if (rgd->rd_free_clone < tot_reserved)
		tot_reserved = 0;

	tot_free = rgd->rd_free_clone - tot_reserved;

	return tot_free;
}
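
/*
 * A quick worked example (made-up numbers): with rd_free_clone = 100
 * blocks and rd_reserved = 30, of which rs_free = 10 belong to our own
 * reservation, other holders account for tot_reserved = 20, so rgd_free()
 * returns 100 - 20 = 80 blocks that we could allocate.
 */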
1540 
1541 /**
1542  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1543  * @rgd: the resource group descriptor
1544  * @ip: pointer to the inode for which we're reserving blocks
1545  * @ap: the allocation parameters
1546  *
1547  */
1548 
rg_mblk_search(struct gfs2_rgrpd * rgd,struct gfs2_inode * ip,const struct gfs2_alloc_parms * ap)1549 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1550 			   const struct gfs2_alloc_parms *ap)
1551 {
1552 	struct gfs2_rbm rbm = { .rgd = rgd, };
1553 	u64 goal;
1554 	struct gfs2_blkreserv *rs = &ip->i_res;
1555 	u32 extlen;
1556 	u32 free_blocks = rgd_free(rgd, rs);
1557 	int ret;
1558 	struct inode *inode = &ip->i_inode;
1559 
1560 	if (S_ISDIR(inode->i_mode))
1561 		extlen = 1;
1562 	else {
1563 		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
1564 		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
1565 	}
1566 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1567 		return;
1568 
1569 	/* Find bitmap block that contains bits for goal block */
1570 	if (rgrp_contains_block(rgd, ip->i_goal))
1571 		goal = ip->i_goal;
1572 	else
1573 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1574 
1575 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1576 		return;
1577 
1578 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1579 	if (ret == 0) {
1580 		rs->rs_rbm = rbm;
1581 		rs->rs_free = extlen;
1582 		rs_insert(ip);
1583 	} else {
1584 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1585 			rgd->rd_last_alloc = 0;
1586 	}
1587 }
1588 
1589 /**
1590  * gfs2_next_unreserved_block - Return next block that is not reserved
1591  * @rgd: The resource group
1592  * @block: The starting block
1593  * @length: The required length
1594  * @ip: Ignore any reservations for this inode
1595  *
1596  * If the block does not appear in any reservation, then return the
1597  * block number unchanged. If it does appear in the reservation, then
1598  * keep looking through the tree of reservations in order to find the
1599  * first block number which is not reserved.
1600  */
1601 
gfs2_next_unreserved_block(struct gfs2_rgrpd * rgd,u64 block,u32 length,const struct gfs2_inode * ip)1602 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1603 				      u32 length,
1604 				      const struct gfs2_inode *ip)
1605 {
1606 	struct gfs2_blkreserv *rs;
1607 	struct rb_node *n;
1608 	int rc;
1609 
1610 	spin_lock(&rgd->rd_rsspin);
1611 	n = rgd->rd_rstree.rb_node;
1612 	while (n) {
1613 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1614 		rc = rs_cmp(block, length, rs);
1615 		if (rc < 0)
1616 			n = n->rb_left;
1617 		else if (rc > 0)
1618 			n = n->rb_right;
1619 		else
1620 			break;
1621 	}
1622 
1623 	if (n) {
1624 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1625 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1626 			n = n->rb_right;
1627 			if (n == NULL)
1628 				break;
1629 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1630 		}
1631 	}
1632 
1633 	spin_unlock(&rgd->rd_rsspin);
1634 	return block;
1635 }
1636 
1637 /**
1638  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1639  * @rbm: The current position in the resource group
1640  * @ip: The inode for which we are searching for blocks
1641  * @minext: The minimum extent length
1642  * @maxext: A pointer to the maximum extent structure
1643  *
1644  * This checks the current position in the rgrp to see whether there is
1645  * a reservation covering this block. If not then this function is a
1646  * no-op. If there is, then the position is moved to the end of the
1647  * contiguous reservation(s) so that we are pointing at the first
1648  * non-reserved block.
1649  *
1650  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1651  */
1652 
gfs2_reservation_check_and_update(struct gfs2_rbm * rbm,const struct gfs2_inode * ip,u32 minext,struct gfs2_extent * maxext)1653 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1654 					     const struct gfs2_inode *ip,
1655 					     u32 minext,
1656 					     struct gfs2_extent *maxext)
1657 {
1658 	u64 block = gfs2_rbm_to_block(rbm);
1659 	u32 extlen = 1;
1660 	u64 nblock;
1661 	int ret;
1662 
1663 	/*
1664 	 * If we have a minimum extent length, then skip over any extent
1665 	 * which is less than the min extent length in size.
1666 	 */
1667 	if (minext) {
1668 		extlen = gfs2_free_extlen(rbm, minext);
1669 		if (extlen <= maxext->len)
1670 			goto fail;
1671 	}
1672 
1673 	/*
1674 	 * Check the extent which has been found against the reservations
1675 	 * and skip if parts of it are already reserved
1676 	 */
1677 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1678 	if (nblock == block) {
1679 		if (!minext || extlen >= minext)
1680 			return 0;
1681 
1682 		if (extlen > maxext->len) {
1683 			maxext->len = extlen;
1684 			maxext->rbm = *rbm;
1685 		}
1686 fail:
1687 		nblock = block + extlen;
1688 	}
1689 	ret = gfs2_rbm_from_block(rbm, nblock);
1690 	if (ret < 0)
1691 		return ret;
1692 	return 1;
1693 }

/**
 * gfs2_rbm_find - Look for blocks of a particular state
 * @rbm: Value/result starting position and final position
 * @state: The state which we want to find
 * @minext: Pointer to the requested extent length (NULL for a single block)
 *          This is updated to be the actual reservation size.
 * @ip: If set, check for reservations
 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
 *          around until we've reached the starting point.
 *
 * Side effects:
 * - If looking for free blocks, we set GBF_FULL on each bitmap which
 *   has no free blocks in it.
 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
 *   has come up short on a free block search.
 *
 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
 */

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap)
{
	struct buffer_head *bh;
	int initial_bii;
	u32 initial_offset;
	int first_bii = rbm->bii;
	u32 first_offset = rbm->offset;
	u32 offset;
	u8 *buffer;
	int n = 0;
	int iters = rbm->rgd->rd_length;
	int ret;
	struct gfs2_bitmap *bi;
	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };

	/* If we are not starting at the beginning of a bitmap, then we
	 * need to add one to the bitmap count to ensure that we search
	 * the starting bitmap twice.
	 */
	if (rbm->offset != 0)
		iters++;

	while (1) {
		bi = rbm_bi(rbm);
		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
		    test_bit(GBF_FULL, &bi->bi_flags) &&
		    (state == GFS2_BLKST_FREE))
			goto next_bitmap;

		bh = bi->bi_bh;
		buffer = bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bh));
		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
			buffer = bi->bi_clone + bi->bi_offset;
		initial_offset = rbm->offset;
		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
		if (offset == BFITNOENT)
			goto bitmap_full;
		rbm->offset = offset;
		if (ip == NULL)
			return 0;

		initial_bii = rbm->bii;
		ret = gfs2_reservation_check_and_update(rbm, ip,
							minext ? *minext : 0,
							&maxext);
		if (ret == 0)
			return 0;
		if (ret > 0) {
			n += (rbm->bii - initial_bii);
			goto next_iter;
		}
		if (ret == -E2BIG) {
			rbm->bii = 0;
			rbm->offset = 0;
			n += (rbm->bii - initial_bii);
			goto res_covered_end_of_rgrp;
		}
		return ret;

bitmap_full:	/* Mark bitmap as full and fall through */
		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
			set_bit(GBF_FULL, &bi->bi_flags);

next_bitmap:	/* Find next bitmap in the rgrp */
		rbm->offset = 0;
		rbm->bii++;
		if (rbm->bii == rbm->rgd->rd_length)
			rbm->bii = 0;
res_covered_end_of_rgrp:
		if ((rbm->bii == 0) && nowrap)
			break;
		n++;
next_iter:
		if (n >= iters)
			break;
	}

	if (minext == NULL || state != GFS2_BLKST_FREE)
		return -ENOSPC;

	/* If the extent was too small, and it's smaller than the smallest
	   to have failed before, remember for future reference that it's
	   useless to search this rgrp again for this amount or more. */
	if ((first_offset == 0) && (first_bii == 0) &&
	    (*minext < rbm->rgd->rd_extfail_pt))
		rbm->rgd->rd_extfail_pt = *minext;

	/* If the maximum extent we found is big enough to fulfill the
	   minimum requirements, use it anyway. */
	if (maxext.len) {
		*rbm = maxext.rbm;
		*minext = maxext.len;
		return 0;
	}

	return -ENOSPC;
}
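
/*
 * A minimal usage sketch for gfs2_rbm_find(), in the style of the
 * allocator paths further below (the extent length is illustrative):
 *
 *	struct gfs2_rbm rbm = { .rgd = rgd, };	// search from rgrp start
 *	u32 minext = 32;			// want a 32 block extent
 *
 *	if (gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, ip, true) == 0) {
 *		// rbm points at the start of a free extent; minext may
 *		// have been reduced to the longest extent actually found
 *	}
 */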

/**
 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
 * @rgd: The rgrp
 * @last_unlinked: block address of the last dinode we unlinked
 * @skip: block address we should explicitly not unlink
 *
 * Nothing is returned to the caller; any reclaimable unlinked inodes
 * which are found are queued to the delete workqueue for disposal.
 */

static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
{
	u64 block;
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl;
	struct gfs2_inode *ip;
	int error;
	int found = 0;
	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };

	while (1) {
		down_write(&sdp->sd_log_flush_lock);
		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
				      true);
		up_write(&sdp->sd_log_flush_lock);
		if (error == -ENOSPC)
			break;
		if (WARN_ON_ONCE(error))
			break;

		block = gfs2_rbm_to_block(&rbm);
		if (gfs2_rbm_from_block(&rbm, block + 1))
			break;
		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
			continue;
		if (block == skip)
			continue;
		*last_unlinked = block;

		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
		if (error)
			continue;

		/* If the inode is already in cache, we can ignore it here
		 * because the existing inode disposal code will deal with
		 * it when all refs have gone away. Accessing gl_object like
		 * this is not safe in general. Here it is ok because we do
		 * not dereference the pointer, and we only need an approx
		 * answer to whether it is NULL or not.
		 */
		ip = gl->gl_object;

		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
			gfs2_glock_put(gl);
		else
			found++;

		/* Limit reclaim to sensible number of tasks */
		if (found > NR_CPUS)
			return;
	}

	rgd->rd_flags &= ~GFS2_RDF_CHECK;
	return;
}

/**
 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
 * @rgd: The rgrp in question
 * @loops: An indication of how picky we can be (0=very, 1=less so)
 *
 * This function uses the recently added glock statistics in order to
 * figure out whether a particular resource group is suffering from
 * contention from multiple nodes. This is done purely on the basis
 * of timings, since this is the only data we have to work with and
 * our aim here is to reject a resource group which is highly contended
 * but (very important) not to do this too often in order to ensure that
 * we do not end up introducing fragmentation by changing resource
 * groups when not actually required.
 *
 * The calculation is fairly simple, we want to know whether the SRTTB
 * (i.e. smoothed round trip time for blocking operations) to acquire
 * the lock for this rgrp's glock is significantly greater than the
 * time taken for resource groups on average. We introduce a margin in
 * the form of the variable @var which is computed as the sum of the two
 * respective variances, and multiplied by a factor depending on @loops
 * and whether we have a lot of data to base the decision on. This is
 * then tested against the square difference of the means in order to
 * decide whether the result is statistically significant or not.
 *
 * Returns: A boolean verdict on the congestion status
 */

static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
{
	const struct gfs2_glock *gl = rgd->rd_gl;
	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
	struct gfs2_lkstats *st;
	u64 r_dcount, l_dcount;
	u64 l_srttb, a_srttb = 0;
	s64 srttb_diff;
	u64 sqr_diff;
	u64 var;
	int cpu, nonzero = 0;

	preempt_disable();
	for_each_present_cpu(cpu) {
		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
		if (st->stats[GFS2_LKS_SRTTB]) {
			a_srttb += st->stats[GFS2_LKS_SRTTB];
			nonzero++;
		}
	}
	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
	if (nonzero)
		do_div(a_srttb, nonzero);
	r_dcount = st->stats[GFS2_LKS_DCOUNT];
	var = st->stats[GFS2_LKS_SRTTVARB] +
	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
	preempt_enable();

	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];

	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
		return false;

	srttb_diff = a_srttb - l_srttb;
	sqr_diff = srttb_diff * srttb_diff;

	var *= 2;
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;
	if (loops == 1)
		var *= 2;

	return ((srttb_diff < 0) && (sqr_diff > var));
}
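
/*
 * A worked example of the significance test above, with invented
 * numbers: if the all-node average SRTTB is 1000 and this rgrp's local
 * SRTTB is 1600, then srttb_diff = -600 and sqr_diff = 360000.  If the
 * two variances sum to 100000, var starts out at 200000; with plenty
 * of samples (both dcounts >= 8) and loops == 0 it stays there, so
 * 360000 > 200000 and the rgrp is reported congested.  With fewer
 * samples, or on a later loop, var doubles and the test becomes harder
 * to pass, i.e. we become correspondingly less picky about congestion.
 */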

/**
 * gfs2_rgrp_used_recently
 * @rs: The block reservation with the rgrp to test
 * @msecs: The time limit in milliseconds
 *
 * Returns: True if the rgrp glock has been used within the time limit
 */
static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
				    u64 msecs)
{
	u64 tdiff;

	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));

	return tdiff > (msecs * 1000 * 1000);
}

static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
{
	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	u32 skip;

	get_random_bytes(&skip, sizeof(skip));
	return skip % sdp->sd_rgrps;
}
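
/*
 * Design note: gfs2_orlov_skip() returns a uniformly random number of
 * resource groups to step over, so that new directories allocated with
 * GFS2_AF_ORLOV are spread across the filesystem rather than clustered
 * around their parent, in the spirit of the Orlov allocator used by
 * other Linux filesystems.
 */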

static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
{
	struct gfs2_rgrpd *rgd = *pos;
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	rgd = gfs2_rgrpd_get_next(rgd);
	if (rgd == NULL)
		rgd = gfs2_rgrpd_get_first(sdp);
	*pos = rgd;
	if (rgd != begin) /* If we didn't wrap */
		return true;
	return false;
}

/**
 * fast_to_acquire - determine if a resource group will be fast to acquire
 * @rgd: The rgrp to test
 *
 * If this is one of our preferred rgrps, it should be quicker to acquire,
 * because we tried to set ourselves up as dlm lock master.
 */
static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
{
	struct gfs2_glock *gl = rgd->rd_gl;

	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
		return 1;
	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
		return 1;
	return 0;
}

/**
 * gfs2_inplace_reserve - Reserve space in the filesystem
 * @ip: the inode to reserve space for
 * @ap: the allocation parameters
 *
 * We try our best to find an rgrp that has at least ap->target blocks
 * available. After a couple of passes (loops == 2), the prospects of finding
 * such an rgrp diminish. At this stage, we return the first rgrp that has
 * at least ap->min_target blocks available. Either way, we set ap->allowed
 * to the number of blocks available in the chosen rgrp.
 *
 * Returns: 0 on success,
 *          -ENOSPC if a suitable rgrp can't be found
 *          errno otherwise
 */

int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *begin = NULL;
	struct gfs2_blkreserv *rs = &ip->i_res;
	int error = 0, rg_locked, flags = 0;
	u64 last_unlinked = NO_BLOCK;
	int loops = 0;
	u32 free_blocks, skip = 0;

	if (sdp->sd_args.ar_rgrplvb)
		flags |= GL_SKIP;
	if (gfs2_assert_warn(sdp, ap->target))
		return -EINVAL;
	if (gfs2_rs_active(rs)) {
		begin = rs->rs_rbm.rgd;
	} else if (rs->rs_rbm.rgd &&
		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
		begin = rs->rs_rbm.rgd;
	} else {
		check_and_update_goal(ip);
		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
	}
	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
		skip = gfs2_orlov_skip(ip);
	if (rs->rs_rbm.rgd == NULL)
		return -EBADSLT;

	while (loops < 3) {
		rg_locked = 1;

		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
			rg_locked = 0;
			if (skip && skip--)
				goto next_rgrp;
			if (!gfs2_rs_active(rs)) {
				if (loops == 0 &&
				    !fast_to_acquire(rs->rs_rbm.rgd))
					goto next_rgrp;
				if ((loops < 2) &&
				    gfs2_rgrp_used_recently(rs, 1000) &&
				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
					goto next_rgrp;
			}
			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
						   LM_ST_EXCLUSIVE, flags,
						   &rs->rs_rgd_gh);
			if (unlikely(error))
				return error;
			if (!gfs2_rs_active(rs) && (loops < 2) &&
			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
				goto skip_rgrp;
			if (sdp->sd_args.ar_rgrplvb) {
				error = update_rgrp_lvb(rs->rs_rbm.rgd);
				if (unlikely(error)) {
					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
					return error;
				}
			}
		}

		/* Skip unusable resource groups */
		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
						 GFS2_RDF_ERROR)) ||
		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
			goto skip_rgrp;

		if (sdp->sd_args.ar_rgrplvb)
			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);

		/* Get a reservation if we don't already have one */
		if (!gfs2_rs_active(rs))
			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);

		/* Skip rgrps when we can't get a reservation on first pass */
		if (!gfs2_rs_active(rs) && (loops < 1))
			goto check_rgrp;

		/* If rgrp has enough free space, use it */
		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
		if (free_blocks >= ap->target ||
		    (loops == 2 && ap->min_target &&
		     free_blocks >= ap->min_target)) {
			ap->allowed = free_blocks;
			return 0;
		}
check_rgrp:
		/* Check for unlinked inodes which can be reclaimed */
		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
					ip->i_no_addr);
skip_rgrp:
		/* Drop reservation, if we couldn't use reserved rgrp */
		if (gfs2_rs_active(rs))
			gfs2_rs_deltree(rs);

		/* Unlock rgrp if required */
		if (!rg_locked)
			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
next_rgrp:
		/* Find the next rgrp, and continue looking */
		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
			continue;
		if (skip)
			continue;

		/* If we've scanned all the rgrps, but found no free blocks
		 * then this checks for some less likely conditions before
		 * trying again.
		 */
		loops++;
		/* Check that fs hasn't grown if writing to rindex */
		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
			error = gfs2_ri_update(ip);
			if (error)
				return error;
		}
		/* Flushing the log may release space */
		if (loops == 2)
			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
				       GFS2_LFC_INPLACE_RESERVE);
	}

	return -ENOSPC;
}
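
/*
 * A sketch of the pass structure above, assuming no block reservation
 * is held on entry:
 *
 *	loops == 0:  only rgrps which look fast to acquire, are not
 *		     recently congested, and can satisfy ap->target in
 *		     full are considered
 *	loops == 1:  the congestion test is relaxed, and rgrps on which
 *		     no multi-block reservation could be made are no
 *		     longer skipped
 *	loops == 2:  after a log flush, any rgrp with at least
 *		     ap->min_target free blocks is accepted
 */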

/**
 * gfs2_inplace_release - release an inplace reservation
 * @ip: the inode the reservation was taken out on
 *
 * Release a reservation made by gfs2_inplace_reserve().
 */

void gfs2_inplace_release(struct gfs2_inode *ip)
{
	struct gfs2_blkreserv *rs = &ip->i_res;

	if (gfs2_holder_initialized(&rs->rs_rgd_gh))
		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
}

/**
 * gfs2_alloc_extent - allocate an extent from a given bitmap
 * @rbm: the resource group information
 * @dinode: TRUE if the first block we allocate is for a dinode
 * @n: The extent length (value/result)
 *
 * Add the bitmap buffer to the transaction.
 * Set the found bits to the new allocation state (GFS2_BLKST_DINODE for
 * the first block if @dinode is set, GFS2_BLKST_USED otherwise).
 */
static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
			      unsigned int *n)
{
	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
	const unsigned int elen = *n;
	u64 block;
	int ret;

	*n = 1;
	block = gfs2_rbm_to_block(rbm);
	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
	block++;
	while (*n < elen) {
		ret = gfs2_rbm_from_block(&pos, block);
		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
			break;
		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
		(*n)++;
		block++;
	}
}

/**
 * rgblk_free - Change alloc state of given block(s)
 * @sdp: the filesystem
 * @bstart: the start of a run of blocks to free
 * @blen: the length of the block run (all must lie within ONE RG!)
 * @new_state: GFS2_BLKST_XXX the after-allocation block state
 *
 * Returns:  Resource group containing the block(s)
 */

static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
				     u32 blen, unsigned char new_state)
{
	struct gfs2_rbm rbm;
	struct gfs2_bitmap *bi, *bi_prev = NULL;

	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
	if (!rbm.rgd) {
		if (gfs2_consist(sdp))
			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
		return NULL;
	}

	gfs2_rbm_from_block(&rbm, bstart);
	while (blen--) {
		bi = rbm_bi(&rbm);
		if (bi != bi_prev) {
			if (!bi->bi_clone) {
				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
						       GFP_NOFS | __GFP_NOFAIL);
				memcpy(bi->bi_clone + bi->bi_offset,
				       bi->bi_bh->b_data + bi->bi_offset,
				       bi->bi_len);
			}
			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
			bi_prev = bi;
		}
		gfs2_setbit(&rbm, false, new_state);
		gfs2_rbm_incr(&rbm);
	}

	return rbm.rgd;
}
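
/*
 * Design note on the bi_clone copy above: a block freed in the current
 * transaction must not be reallocated until that transaction is safely
 * in the log.  Allocation searches therefore read from bi_clone (see
 * gfs2_rbm_find()), which still shows such blocks as in use, while the
 * real bitmap is updated here; the clone is brought back into sync
 * with the real bitmap once the log has been flushed.
 */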

/**
 * gfs2_rgrp_dump - print out an rgrp
 * @seq: The iterator
 * @gl: The glock in question
 *
 */

void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
{
	struct gfs2_rgrpd *rgd = gl->gl_object;
	struct gfs2_blkreserv *trs;
	const struct rb_node *n;

	if (rgd == NULL)
		return;
	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
		       rgd->rd_reserved, rgd->rd_extfail_pt);
	spin_lock(&rgd->rd_rsspin);
	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		dump_rs(seq, trs);
	}
	spin_unlock(&rgd->rd_rsspin);
}

static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
		(unsigned long long)rgd->rd_addr);
	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
	gfs2_rgrp_dump(NULL, rgd->rd_gl);
	rgd->rd_flags |= GFS2_RDF_ERROR;
}

/**
 * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
 * @ip: The inode we have just allocated blocks for
 * @rbm: The start of the allocated blocks
 * @len: The extent length
 *
 * Adjusts a reservation after an allocation has taken place. If the
 * reservation does not match the allocation, or if it is now empty
 * then it is removed.
 */

static void gfs2_adjust_reservation(struct gfs2_inode *ip,
				    const struct gfs2_rbm *rbm, unsigned len)
{
	struct gfs2_blkreserv *rs = &ip->i_res;
	struct gfs2_rgrpd *rgd = rbm->rgd;
	unsigned rlen;
	u64 block;
	int ret;

	spin_lock(&rgd->rd_rsspin);
	if (gfs2_rs_active(rs)) {
		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
			block = gfs2_rbm_to_block(rbm);
			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
			rlen = min(rs->rs_free, len);
			rs->rs_free -= rlen;
			rgd->rd_reserved -= rlen;
			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
			if (rs->rs_free && !ret)
				goto out;
			/* We used up our block reservation, so we should
			   reserve more blocks next time. */
			atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
		}
		__rs_deltree(rs);
	}
out:
	spin_unlock(&rgd->rd_rsspin);
}
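
/*
 * An illustrative example of the adjustment above: if a reservation
 * starts at block B with rs_free == 16 and the allocator has just
 * claimed 4 blocks starting at B, the reservation shrinks to 12 blocks
 * starting at B + 4, and rgd->rd_reserved drops by 4.  If the
 * allocation did not begin at the reserved position, or the
 * reservation is now empty, it is removed instead.
 */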

/**
 * gfs2_set_alloc_start - Set starting point for block allocation
 * @rbm: The rbm which will be set to the required location
 * @ip: The gfs2 inode
 * @dinode: Flag to say if allocation includes a new inode
 *
 * This sets the starting point from the reservation if one is active
 * otherwise it falls back to guessing a start point based on the
 * inode's goal block or the last allocation point in the rgrp.
 */

static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
				 const struct gfs2_inode *ip, bool dinode)
{
	u64 goal;

	if (gfs2_rs_active(&ip->i_res)) {
		*rbm = ip->i_res.rs_rbm;
		return;
	}

	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
		goal = ip->i_goal;
	else
		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;

	gfs2_rbm_from_block(rbm, goal);
}

/**
 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
 * @ip: the inode to allocate the block for
 * @bn: Used to return the starting block number
 * @nblocks: requested number of blocks/extent length (value/result)
 * @dinode: 1 if we're allocating a dinode block, else 0
 * @generation: the generation number of the inode
 *
 * Returns: 0 or error
 */

int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
		      bool dinode, u64 *generation)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
	unsigned int ndata;
	u64 block; /* block, within the file system scope */
	int error;

	gfs2_set_alloc_start(&rbm, ip, dinode);
	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);

	if (error == -ENOSPC) {
		gfs2_set_alloc_start(&rbm, ip, dinode);
		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
	}

	/* Since all blocks are reserved in advance, this shouldn't happen */
	if (error) {
		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
			(unsigned long long)ip->i_no_addr, error, *nblocks,
			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
			rbm.rgd->rd_extfail_pt);
		goto rgrp_error;
	}

	gfs2_alloc_extent(&rbm, dinode, nblocks);
	block = gfs2_rbm_to_block(&rbm);
	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
	if (gfs2_rs_active(&ip->i_res))
		gfs2_adjust_reservation(ip, &rbm, *nblocks);
	ndata = *nblocks;
	if (dinode)
		ndata--;

	if (!dinode) {
		ip->i_goal = block + ndata - 1;
		error = gfs2_meta_inode_buffer(ip, &dibh);
		if (error == 0) {
			struct gfs2_dinode *di =
				(struct gfs2_dinode *)dibh->b_data;
			gfs2_trans_add_meta(ip->i_gl, dibh);
			di->di_goal_meta = di->di_goal_data =
				cpu_to_be64(ip->i_goal);
			brelse(dibh);
		}
	}
	if (rbm.rgd->rd_free < *nblocks) {
		pr_warn("nblocks=%u\n", *nblocks);
		goto rgrp_error;
	}

	rbm.rgd->rd_free -= *nblocks;
	if (dinode) {
		rbm.rgd->rd_dinodes++;
		*generation = rbm.rgd->rd_igeneration++;
		if (*generation == 0)
			*generation = rbm.rgd->rd_igeneration++;
	}

	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);

	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
	if (dinode)
		gfs2_trans_add_unrevoke(sdp, block, *nblocks);

	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);

	rbm.rgd->rd_free_clone -= *nblocks;
	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
	*bn = block;
	return 0;

rgrp_error:
	gfs2_rgrp_error(rbm.rgd);
	return -EIO;
}
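
/*
 * A minimal caller sketch for gfs2_alloc_blocks(), along the lines of
 * the data allocation path (error handling elided; "requested" is an
 * illustrative value):
 *
 *	unsigned int n = requested;	// extent length wanted
 *	u64 bn;
 *
 *	error = gfs2_alloc_blocks(ip, &bn, &n, false, NULL);
 *	// on success, bn holds the first allocated block and n the
 *	// extent length actually allocated, which may be shorter
 */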

/**
 * __gfs2_free_blocks - free a contiguous run of block(s)
 * @ip: the inode these blocks are being freed from
 * @bstart: first block of a run of contiguous blocks
 * @blen: the length of the block run
 * @meta: 1 if the blocks represent metadata
 *
 */

void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;

	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
	if (!rgd)
		return;
	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
	rgd->rd_free += blen;
	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);

	/* Directories keep their data in the metadata address space */
	if (meta || ip->i_depth)
		gfs2_meta_wipe(ip, bstart, blen);
}

/**
 * gfs2_free_meta - free a contiguous run of metadata block(s)
 * @ip: the inode these blocks are being freed from
 * @bstart: first block of a run of contiguous blocks
 * @blen: the length of the block run
 *
 */

void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);

	__gfs2_free_blocks(ip, bstart, blen, 1);
	gfs2_statfs_change(sdp, 0, +blen, 0);
	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
}

void gfs2_unlink_di(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct gfs2_rgrpd *rgd;
	u64 blkno = ip->i_no_addr;

	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
	if (!rgd)
		return;
	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
}

void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_rgrpd *tmp_rgd;

	tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE);
	if (!tmp_rgd)
		return;
	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);

	if (!rgd->rd_dinodes)
		gfs2_consist_rgrpd(rgd);
	rgd->rd_dinodes--;
	rgd->rd_free++;

	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);

	gfs2_statfs_change(sdp, 0, +1, -1);
	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
}

/**
 * gfs2_check_blk_type - Check the type of a block
 * @sdp: The superblock
 * @no_addr: The block number to check
 * @type: The block type we are looking for
 *
 * Returns: 0 if the block type matches the expected type
 *          -ESTALE if it doesn't match
 *          or -ve errno if something went wrong while checking
 */

int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
{
	struct gfs2_rgrpd *rgd;
	struct gfs2_holder rgd_gh;
	struct gfs2_rbm rbm;
	int error = -EINVAL;

	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
	if (!rgd)
		goto fail;

	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
	if (error)
		goto fail;

	rbm.rgd = rgd;
	error = gfs2_rbm_from_block(&rbm, no_addr);
	WARN_ON_ONCE(error != 0);

	if (gfs2_testbit(&rbm, false) != type)
		error = -ESTALE;

	gfs2_glock_dq_uninit(&rgd_gh);
fail:
	return error;
}
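
/*
 * Example use of gfs2_check_blk_type(), in the spirit of the inode
 * lookup paths, to confirm that a suspected dinode block still is one:
 *
 *	error = gfs2_check_blk_type(sdp, no_addr, GFS2_BLKST_DINODE);
 *	if (error == -ESTALE)
 *		// the block is not (or is no longer) a dinode
 */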

/**
 * gfs2_rlist_add - add a RG to a list of RGs
 * @ip: the inode
 * @rlist: the list of resource groups
 * @block: the block
 *
 * Figure out what RG a block belongs to and add that RG to the list
 *
 * FIXME: Don't use NOFAIL
 *
 */

void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
		    u64 block)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_rgrpd **tmp;
	unsigned int new_space;
	unsigned int x;

	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
		return;

	/*
	 * The resource group last accessed is kept in the last position.
	 */

	if (rlist->rl_rgrps) {
		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
		if (rgrp_contains_block(rgd, block))
			return;
		rgd = gfs2_blk2rgrpd(sdp, block, 1);
	} else {
		rgd = ip->i_res.rs_rbm.rgd;
		if (!rgd || !rgrp_contains_block(rgd, block))
			rgd = gfs2_blk2rgrpd(sdp, block, 1);
	}

	if (!rgd) {
		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
		       (unsigned long long)block);
		return;
	}

	for (x = 0; x < rlist->rl_rgrps; x++) {
		if (rlist->rl_rgd[x] == rgd) {
			swap(rlist->rl_rgd[x],
			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
			return;
		}
	}

	if (rlist->rl_rgrps == rlist->rl_space) {
		new_space = rlist->rl_space + 10;

		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
			      GFP_NOFS | __GFP_NOFAIL);

		if (rlist->rl_rgd) {
			memcpy(tmp, rlist->rl_rgd,
			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
			kfree(rlist->rl_rgd);
		}

		rlist->rl_space = new_space;
		rlist->rl_rgd = tmp;
	}

	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
}

/**
 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
 *      and initialize an array of glock holders for them
 * @rlist: the list of resource groups
 * @state: the lock state to acquire the RG lock in
 *
 * FIXME: Don't use NOFAIL
 *
 */

void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
{
	unsigned int x;

	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
				      sizeof(struct gfs2_holder),
				      GFP_NOFS | __GFP_NOFAIL);
	for (x = 0; x < rlist->rl_rgrps; x++)
		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
				 state, 0,
				 &rlist->rl_ghs[x]);
}

/**
 * gfs2_rlist_free - free a resource group list
 * @rlist: the list of resource groups
 *
 */

void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
{
	unsigned int x;

	kfree(rlist->rl_rgd);

	if (rlist->rl_ghs) {
		for (x = 0; x < rlist->rl_rgrps; x++)
			gfs2_holder_uninit(&rlist->rl_ghs[x]);
		kfree(rlist->rl_ghs);
		rlist->rl_ghs = NULL;
	}
}
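
/*
 * Putting the rlist API together: a typical deallocation sequence, in
 * the approximate shape used by the truncate and directory leaf
 * freeing code (error handling elided):
 *
 *	struct gfs2_rgrp_list rlist = {};
 *
 *	gfs2_rlist_add(ip, &rlist, block);	// once per block or run
 *	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
 *	gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
 *	// ... open a transaction and free the blocks ...
 *	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
 *	gfs2_rlist_free(&rlist);
 */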