Branch data Line data Source code
1 : : /*
2 : : * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 : : * Written by Alex Tomas <alex@clusterfs.com>
4 : : *
5 : : * This program is free software; you can redistribute it and/or modify
6 : : * it under the terms of the GNU General Public License version 2 as
7 : : * published by the Free Software Foundation.
8 : : *
9 : : * This program is distributed in the hope that it will be useful,
10 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : : * GNU General Public License for more details.
13 : : *
14 : : * You should have received a copy of the GNU General Public License
15 : : * along with this program; if not, write to the Free Software
16 : : * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 : : */
18 : :
19 : :
20 : : /*
21 : : * mballoc.c contains the multiblocks allocation routines
22 : : */
23 : :
24 : : #include "ext4_jbd2.h"
25 : : #include "mballoc.h"
26 : : #include <linux/log2.h>
27 : : #include <linux/module.h>
28 : : #include <linux/slab.h>
29 : : #include <trace/events/ext4.h>
30 : :
31 : : #ifdef CONFIG_EXT4_DEBUG
32 : : ushort ext4_mballoc_debug __read_mostly;
33 : :
34 : : module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
35 : : MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
36 : : #endif
37 : :
38 : : /*
39 : : * MUSTDO:
40 : : * - test ext4_ext_search_left() and ext4_ext_search_right()
41 : : * - search for metadata in a few groups
42 : : *
43 : : * TODO v4:
44 : : * - normalization should take into account whether file is still open
45 : : * - discard preallocations if no free space left (policy?)
46 : : * - don't normalize tails
47 : : * - quota
48 : : * - reservation for superuser
49 : : *
50 : : * TODO v3:
51 : : * - bitmap read-ahead (proposed by Oleg Drokin aka green)
52 : : * - track min/max extents in each group for better group selection
53 : : * - mb_mark_used() may allocate chunk right after splitting buddy
54 : : * - tree of groups sorted by number of free blocks
55 : : * - error handling
56 : : */
57 : :
58 : : /*
59 : : * An allocation request is a request for multiple blocks
60 : : * near the specified goal block.
61 : : *
62 : : * During the initialization phase of the allocator we decide whether to
63 : : * use group preallocation or inode preallocation depending on the size
64 : : * of the file. The size of the file is the resulting file size we
65 : : * would have after allocation, or the current file size, whichever
66 : : * is larger. If the size is less than sbi->s_mb_stream_request we
67 : : * use group preallocation. The default value of
68 : : * s_mb_stream_request is 16 blocks. This can also be tuned via
69 : : * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
70 : : * terms of number of blocks.
71 : : *
72 : : * The main motivation for having small files use group preallocation is
73 : : * to keep small files close together on the disk.
74 : : *
75 : : * In the first stage, the allocator looks at the inode prealloc list,
76 : : * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
77 : : * spaces for this particular inode. The inode prealloc space is
78 : : * represented as:
79 : : *
80 : : * pa_lstart -> the logical start block for this prealloc space
81 : : * pa_pstart -> the physical start block for this prealloc space
82 : : * pa_len -> length for this prealloc space (in clusters)
83 : : * pa_free -> free space available in this prealloc space (in clusters)
84 : : *
85 : : * The inode preallocation space is used by looking at the _logical_ start
86 : : * block. Only if the logical file block falls within the range of a
87 : : * prealloc space do we consume that particular prealloc space. This makes
88 : : * sure that we have contiguous physical blocks representing the file blocks.
89 : : *
90 : : * The important thing to note about the inode prealloc space is that
91 : : * we don't modify any of the values associated with it except
92 : : * pa_free.
93 : : *
94 : : * If we are not able to find blocks in the inode prealloc space and if we
95 : : * have the group allocation flag set then we look at the locality group
96 : : * prealloc space. This is a per-CPU prealloc list represented as
97 : : *
98 : : * ext4_sb_info.s_locality_groups[smp_processor_id()]
99 : : *
100 : : * The reason for having a per cpu locality group is to reduce the contention
101 : : * between CPUs. It is possible to get scheduled at this point.
102 : : *
103 : : * The locality group prealloc space is used looking at whether we have
104 : : * enough free space (pa_free) within the prealloc space.
105 : : *
106 : : * If we can't allocate blocks via inode prealloc and/or locality group
107 : : * prealloc then we look at the buddy cache. The buddy cache is represented
108 : : * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
109 : : * mapped to the buddy and bitmap information regarding different
110 : : * groups. The buddy information is attached to buddy cache inode so that
111 : : * we can access them through the page cache. The information regarding
112 : : * each group is loaded via ext4_mb_load_buddy. The information involves
113 : : * the block bitmap and buddy information, which are stored in the
114 : : * inode as:
115 : : *
116 : : * { page }
117 : : * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
118 : : *
119 : : *
120 : : * one block each for bitmap and buddy information. So for each group we
121 : : * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
122 : : * blocksize) blocks. So it can have information regarding groups_per_page
123 : : * which is blocks_per_page/2
124 : : *
125 : : * The buddy cache inode is not stored on disk. The inode is thrown
126 : : * away when the filesystem is unmounted.
127 : : *
128 : : * We look for count blocks in the buddy cache. If we were able
129 : : * to locate that many free blocks we return with additional information
130 : : * regarding the rest of the contiguous physical blocks available.
131 : : *
132 : : * Before allocating blocks via buddy cache we normalize the request
133 : : * blocks. This ensures we ask for more blocks than we need. The extra
134 : : * blocks that we get after allocation are added to the respective prealloc
135 : : * list. In case of inode preallocation we follow a list of heuristics
136 : : * based on file size. This can be found in ext4_mb_normalize_request. If
137 : : * we are doing a group prealloc we try to normalize the request to
138 : : * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
139 : : * dependent on the cluster size; for non-bigalloc file systems, it is
140 : : * 512 blocks. This can be tuned via
141 : : * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
142 : : * terms of number of blocks. If we have mounted the file system with the
143 : : * -o stripe=<value> option, the group prealloc request is normalized to
144 : : * the smallest multiple of the stripe value (sbi->s_stripe) which is
145 : : * greater than the default mb_group_prealloc.
146 : : *
147 : : * The regular allocator (using the buddy cache) supports a few tunables.
148 : : *
149 : : * /sys/fs/ext4/<partition>/mb_min_to_scan
150 : : * /sys/fs/ext4/<partition>/mb_max_to_scan
151 : : * /sys/fs/ext4/<partition>/mb_order2_req
152 : : *
153 : : * The regular allocator uses buddy scan only if the request len is a power of
154 : : * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
155 : : * value of s_mb_order2_reqs can be tuned via
156 : : * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
157 : : * the stripe size (sbi->s_stripe), we try to search for contiguous blocks
158 : : * in stripe-size chunks. This should result in better allocation on RAID
159 : : * setups. If not, we search in the specific group using the bitmap for the
160 : : * best extents. The tunables min_to_scan and max_to_scan control the
161 : : * behaviour here. min_to_scan indicates how long mballoc __must__ look for
162 : : * a best extent and max_to_scan indicates how long mballoc __can__ look for a
163 : : * best extent in the found extents. Searching for the blocks starts with
164 : : * the group specified as the goal value in allocation context via
165 : : * ac_g_ex. Each group is first checked against the criteria for whether it
166 : : * can be used for allocation. ext4_mb_good_group explains how the groups are
167 : : * checked.
168 : : *
169 : : * Both prealloc spaces are populated as above. So the first request
170 : : * will hit the buddy cache, which will result in this prealloc space
171 : : * getting filled. The prealloc space is then later used for
172 : : * subsequent requests.
173 : : */
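
A minimal sketch of the size-based decision described above; the helper
name and parameter types are illustrative, only s_mb_stream_request and
the larger-of-the-two-sizes rule come from the comment:

	/* illustrative only: choose group vs. inode preallocation by size */
	static int use_group_prealloc(unsigned long cur_size,
				      unsigned long result_size,
				      unsigned long mb_stream_request)
	{
		unsigned long size = cur_size > result_size ? cur_size
							    : result_size;

		/* small files use the per-CPU locality group prealloc;
		 * larger files use per-inode preallocation */
		return size < mb_stream_request;
	}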
174 : :
175 : : /*
176 : : * mballoc operates on the following data:
177 : : * - on-disk bitmap
178 : : * - in-core buddy (actually includes buddy and bitmap)
179 : : * - preallocation descriptors (PAs)
180 : : *
181 : : * there are two types of preallocations:
182 : : * - inode
183 : : * assigned to a specific inode and can be used for this inode only.
184 : : * it describes part of inode's space preallocated to specific
185 : : * physical blocks. any block from that preallocation can be used
186 : : * independently. the descriptor just tracks the number of blocks left
187 : : * unused. so, before taking some block from the descriptor, one must
188 : : * make sure the corresponding logical block isn't allocated yet. this
189 : : * also means that freeing any block within the descriptor's range
190 : : * must discard all preallocated blocks.
191 : : * - locality group
192 : : * assigned to a specific locality group which does not translate to a
193 : : * permanent set of inodes: an inode can join and leave the group. space
194 : : * from this type of preallocation can be used for any inode. thus
195 : : * it's consumed from the beginning to the end.
196 : : *
197 : : * relation between them can be expressed as:
198 : : * in-core buddy = on-disk bitmap + preallocation descriptors
199 : : *
200 : : * this means the blocks mballoc considers used are:
201 : : * - allocated blocks (persistent)
202 : : * - preallocated blocks (non-persistent)
203 : : *
204 : : * consistency in mballoc world means that at any time a block is either
205 : : * free or used in ALL structures. notice: "any time" should not be read
206 : : * literally -- time is discrete and delimited by locks.
207 : : *
208 : : * to keep it simple, we don't use block numbers, instead we count number of
209 : : * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
210 : : *
211 : : * all operations can be expressed as:
212 : : * - init buddy: buddy = on-disk + PAs
213 : : * - new PA: buddy += N; PA = N
214 : : * - use inode PA: on-disk += N; PA -= N
215 : : * - discard inode PA: buddy -= on-disk - PA; PA = 0
216 : : * - use locality group PA: on-disk += N; PA -= N
217 : : * - discard locality group PA: buddy -= PA; PA = 0
218 : : * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
219 : : * is used in the real operation because we can't know the actual used
220 : : * bits from the PA, only from the on-disk bitmap
221 : : *
222 : : * if we follow this strict logic, then all operations above should be atomic.
223 : : * given some of them can block, we'd have to use something like semaphores
224 : : * killing performance on high-end SMP hardware. let's try to relax it using
225 : : * the following knowledge:
226 : : * 1) if buddy is referenced, it's already initialized
227 : : * 2) while block is used in buddy and the buddy is referenced,
228 : : * nobody can re-allocate that block
229 : : * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
230 : : * a bit set and a PA claims the same block, it's OK. IOW, one can set a bit
231 : : * in the on-disk bitmap if the buddy has the same bit set and/or a PA
232 : : * covers the corresponding block
233 : : *
234 : : * so, now we're building a concurrency table:
235 : : * - init buddy vs.
236 : : * - new PA
237 : : * blocks for PA are allocated in the buddy, buddy must be referenced
238 : : * until PA is linked to allocation group to avoid concurrent buddy init
239 : : * - use inode PA
240 : : * we need to make sure that either on-disk bitmap or PA has uptodate data
241 : : * given (3) we care that PA-=N operation doesn't interfere with init
242 : : * - discard inode PA
243 : : * the simplest way would be to have buddy initialized by the discard
244 : : * - use locality group PA
245 : : * again PA-=N must be serialized with init
246 : : * - discard locality group PA
247 : : * the simplest way would be to have buddy initialized by the discard
248 : : * - new PA vs.
249 : : * - use inode PA
250 : : * i_data_sem serializes them
251 : : * - discard inode PA
252 : : * discard process must wait until PA isn't used by another process
253 : : * - use locality group PA
254 : : * some mutex should serialize them
255 : : * - discard locality group PA
256 : : * discard process must wait until PA isn't used by another process
257 : : * - use inode PA
258 : : * - use inode PA
259 : : * i_data_sem or another mutex should serialize them
260 : : * - discard inode PA
261 : : * discard process must wait until PA isn't used by another process
262 : : * - use locality group PA
263 : : * nothing wrong here -- they're different PAs covering different blocks
264 : : * - discard locality group PA
265 : : * discard process must wait until PA isn't used by another process
266 : : *
267 : : * now we're ready to draw a few conclusions:
268 : : * - while a PA is referenced, no discard is possible
269 : : * - a PA is referenced until its blocks are marked in the on-disk bitmap
270 : : * - PA changes only after on-disk bitmap
271 : : * - discard must not compete with init. either init is done before
272 : : * any discard or they're serialized somehow
273 : : * - buddy init as sum of on-disk bitmap and PAs is done atomically
274 : : *
275 : : * a special case is when we've used a PA to emptiness. no need to modify
276 : : * the buddy in this case, but we should still care about concurrent init
277 : : *
278 : : */
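
A worked trace of the counting rules above, in pure block counts
(illustrative numbers, not taken from a real run):

	/*
	 * init buddy:             buddy = on-disk used + PA reserved
	 * new PA, N = 8:          buddy += 8;    PA = 8
	 * use inode PA, 3 blocks: on-disk += 3;  PA = 5  (buddy unchanged)
	 * discard inode PA:       the 5 still-unused blocks return to the
	 *                         buddy; which bits they are is learned by
	 *                         re-reading the on-disk bitmap over the PA
	 *                         range, not from the PA itself -- this is
	 *                         the "buddy -= on-disk - PA" rule above
	 */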
279 : :
280 : : /*
281 : : * Logic in a few words:
282 : : *
283 : : * - allocation:
284 : : * load group
285 : : * find blocks
286 : : * mark bits in on-disk bitmap
287 : : * release group
288 : : *
289 : : * - use preallocation:
290 : : * find proper PA (per-inode or group)
291 : : * load group
292 : : * mark bits in on-disk bitmap
293 : : * release group
294 : : * release PA
295 : : *
296 : : * - free:
297 : : * load group
298 : : * mark bits in on-disk bitmap
299 : : * release group
300 : : *
301 : : * - discard preallocations in group:
302 : : * mark PAs deleted
303 : : * move them onto local list
304 : : * load on-disk bitmap
305 : : * load group
306 : : * remove PA from object (inode or locality group)
307 : : * mark free blocks in-core
308 : : *
309 : : * - discard inode's preallocations:
310 : : */
311 : :
312 : : /*
313 : : * Locking rules
314 : : *
315 : : * Locks:
316 : : * - bitlock on a group (group)
317 : : * - object (inode/locality) (object)
318 : : * - per-pa lock (pa)
319 : : *
320 : : * Paths:
321 : : * - new pa
322 : : * object
323 : : * group
324 : : *
325 : : * - find and use pa:
326 : : * pa
327 : : *
328 : : * - release consumed pa:
329 : : * pa
330 : : * group
331 : : * object
332 : : *
333 : : * - generate in-core bitmap:
334 : : * group
335 : : * pa
336 : : *
337 : : * - discard all for given object (inode, locality group):
338 : : * object
339 : : * pa
340 : : * group
341 : : *
342 : : * - discard all for given group:
343 : : * group
344 : : * pa
345 : : * group
346 : : * object
347 : : *
348 : : */
349 : : static struct kmem_cache *ext4_pspace_cachep;
350 : : static struct kmem_cache *ext4_ac_cachep;
351 : : static struct kmem_cache *ext4_free_data_cachep;
352 : :
353 : : /* We create slab caches for groupinfo data structures based on the
354 : : * superblock block size. There will be one per mounted filesystem for
355 : : * each unique s_blocksize_bits */
356 : : #define NR_GRPINFO_CACHES 8
357 : : static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
358 : :
359 : : static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
360 : : "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
361 : : "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
362 : : "ext4_groupinfo_64k", "ext4_groupinfo_128k"
363 : : };
364 : :
365 : : static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
366 : : ext4_group_t group);
367 : : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
368 : : ext4_group_t group);
369 : : static void ext4_free_data_callback(struct super_block *sb,
370 : : struct ext4_journal_cb_entry *jce, int rc);
371 : :
372 : : static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
373 : : {
374 : : #if BITS_PER_LONG == 64
375 : : *bit += ((unsigned long) addr & 7UL) << 3;
376 : : addr = (void *) ((unsigned long) addr & ~7UL);
377 : : #elif BITS_PER_LONG == 32
378 : 12455361 : *bit += ((unsigned long) addr & 3UL) << 3;
379 : 12169959 : addr = (void *) ((unsigned long) addr & ~3UL);
380 : : #else
381 : : #error "how many bits you are?!"
382 : : #endif
383 : : return addr;
384 : : }
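
A concrete reading of the fold above, for the 64-bit case with
hypothetical values: an address 3 bytes past an unsigned-long boundary
contributes 3 * 8 = 24 extra bits, and the address is rounded down to
that boundary:

	int bit = 5;
	void *addr = (void *) 0x1003;	/* 3 bytes past alignment */

	addr = mb_correct_addr_and_bit(&bit, addr);
	/* now addr == (void *) 0x1000 and bit == 5 + 24 == 29: bit 29 of
	 * the aligned word is the same bit as bit 5 at the original 0x1003 */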
385 : :
386 : : static inline int mb_test_bit(int bit, void *addr)
387 : : {
388 : : /*
389 : : * ext4_test_bit on architectures like powerpc
390 : : * needs an unsigned long aligned address
391 : : */
392 : : addr = mb_correct_addr_and_bit(&bit, addr);
393 : : return ext4_test_bit(bit, addr);
394 : : }
395 : :
396 : : static inline void mb_set_bit(int bit, void *addr)
397 : : {
398 : : addr = mb_correct_addr_and_bit(&bit, addr);
399 : : ext4_set_bit(bit, addr);
400 : : }
401 : :
402 : : static inline void mb_clear_bit(int bit, void *addr)
403 : : {
404 : : addr = mb_correct_addr_and_bit(&bit, addr);
405 : : ext4_clear_bit(bit, addr);
406 : : }
407 : :
408 : : static inline int mb_test_and_clear_bit(int bit, void *addr)
409 : : {
410 : : addr = mb_correct_addr_and_bit(&bit, addr);
411 : : return ext4_test_and_clear_bit(bit, addr);
412 : : }
413 : :
414 : : static inline int mb_find_next_zero_bit(void *addr, int max, int start)
415 : : {
416 : : int fix = 0, ret, tmpmax;
417 : : addr = mb_correct_addr_and_bit(&fix, addr);
418 : 437624 : tmpmax = max + fix;
419 : 587244 : start += fix;
420 : :
421 : 591168 : ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
422 [ + - ][ # # ]: 591255 : if (ret > max)
[ + + ][ + - ]
[ # # ][ + - ]
[ + - ]
423 : : return max;
424 : : return ret;
425 : : }
426 : :
427 : : static inline int mb_find_next_bit(void *addr, int max, int start)
428 : : {
429 : : int fix = 0, ret, tmpmax;
430 : : addr = mb_correct_addr_and_bit(&fix, addr);
431 : 82885 : tmpmax = max + fix;
432 : 85463 : start += fix;
433 : :
434 : 85481 : ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
435 [ + + ][ # # ]: 85481 : if (ret > max)
[ + - ]
436 : : return max;
437 : : return ret;
438 : : }
439 : :
440 : 0 : static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
441 : : {
442 : : char *bb;
443 : :
444 [ - + ]: 1959088 : BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
445 [ - + ]: 1959088 : BUG_ON(max == NULL);
446 : :
447 [ + + ]: 1959088 : if (order > e4b->bd_blkbits + 1) {
448 : 94 : *max = 0;
449 : 94 : return NULL;
450 : : }
451 : :
452 : : /* at order 0 we see each particular block */
453 [ + + ]: 1958994 : if (order == 0) {
454 : 971977 : *max = 1 << (e4b->bd_blkbits + 3);
455 : 971977 : return e4b->bd_bitmap;
456 : : }
457 : :
458 : 987017 : bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
459 : 987017 : *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
460 : :
461 : 987017 : return bb;
462 : : }
463 : :
464 : : #ifdef DOUBLE_CHECK
465 : : static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
466 : : int first, int count)
467 : : {
468 : : int i;
469 : : struct super_block *sb = e4b->bd_sb;
470 : :
471 : : if (unlikely(e4b->bd_info->bb_bitmap == NULL))
472 : : return;
473 : : assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
474 : : for (i = 0; i < count; i++) {
475 : : if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
476 : : ext4_fsblk_t blocknr;
477 : :
478 : : blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
479 : : blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
480 : : ext4_grp_locked_error(sb, e4b->bd_group,
481 : : inode ? inode->i_ino : 0,
482 : : blocknr,
483 : : "freeing block already freed "
484 : : "(bit %u)",
485 : : first + i);
486 : : }
487 : : mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
488 : : }
489 : : }
490 : :
491 : : static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
492 : : {
493 : : int i;
494 : :
495 : : if (unlikely(e4b->bd_info->bb_bitmap == NULL))
496 : : return;
497 : : assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
498 : : for (i = 0; i < count; i++) {
499 : : BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
500 : : mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
501 : : }
502 : : }
503 : :
504 : : static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
505 : : {
506 : : if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
507 : : unsigned char *b1, *b2;
508 : : int i;
509 : : b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
510 : : b2 = (unsigned char *) bitmap;
511 : : for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
512 : : if (b1[i] != b2[i]) {
513 : : ext4_msg(e4b->bd_sb, KERN_ERR,
514 : : "corruption in group %u "
515 : : "at byte %u(%u): %x in copy != %x "
516 : : "on disk/prealloc",
517 : : e4b->bd_group, i, i * 8, b1[i], b2[i]);
518 : : BUG();
519 : : }
520 : : }
521 : : }
522 : : }
523 : :
524 : : #else
525 : : static inline void mb_free_blocks_double(struct inode *inode,
526 : : struct ext4_buddy *e4b, int first, int count)
527 : : {
528 : : return;
529 : : }
530 : : static inline void mb_mark_used_double(struct ext4_buddy *e4b,
531 : : int first, int count)
532 : : {
533 : : return;
534 : : }
535 : : static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
536 : : {
537 : : return;
538 : : }
539 : : #endif
540 : :
541 : : #ifdef AGGRESSIVE_CHECK
542 : :
543 : : #define MB_CHECK_ASSERT(assert) \
544 : : do { \
545 : : if (!(assert)) { \
546 : : printk(KERN_EMERG \
547 : : "Assertion failure in %s() at %s:%d: \"%s\"\n", \
548 : : function, file, line, # assert); \
549 : : BUG(); \
550 : : } \
551 : : } while (0)
552 : :
553 : : static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
554 : : const char *function, int line)
555 : : {
556 : : struct super_block *sb = e4b->bd_sb;
557 : : int order = e4b->bd_blkbits + 1;
558 : : int max;
559 : : int max2;
560 : : int i;
561 : : int j;
562 : : int k;
563 : : int count;
564 : : struct ext4_group_info *grp;
565 : : int fragments = 0;
566 : : int fstart;
567 : : struct list_head *cur;
568 : : void *buddy;
569 : : void *buddy2;
570 : :
571 : : {
572 : : static int mb_check_counter;
573 : : if (mb_check_counter++ % 100 != 0)
574 : : return 0;
575 : : }
576 : :
577 : : while (order > 1) {
578 : : buddy = mb_find_buddy(e4b, order, &max);
579 : : MB_CHECK_ASSERT(buddy);
580 : : buddy2 = mb_find_buddy(e4b, order - 1, &max2);
581 : : MB_CHECK_ASSERT(buddy2);
582 : : MB_CHECK_ASSERT(buddy != buddy2);
583 : : MB_CHECK_ASSERT(max * 2 == max2);
584 : :
585 : : count = 0;
586 : : for (i = 0; i < max; i++) {
587 : :
588 : : if (mb_test_bit(i, buddy)) {
589 : : /* only single bit in buddy2 may be 1 */
590 : : if (!mb_test_bit(i << 1, buddy2)) {
591 : : MB_CHECK_ASSERT(
592 : : mb_test_bit((i<<1)+1, buddy2));
593 : : } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
594 : : MB_CHECK_ASSERT(
595 : : mb_test_bit(i << 1, buddy2));
596 : : }
597 : : continue;
598 : : }
599 : :
600 : : /* both bits in buddy2 must be 1 */
601 : : MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
602 : : MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
603 : :
604 : : for (j = 0; j < (1 << order); j++) {
605 : : k = (i * (1 << order)) + j;
606 : : MB_CHECK_ASSERT(
607 : : !mb_test_bit(k, e4b->bd_bitmap));
608 : : }
609 : : count++;
610 : : }
611 : : MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
612 : : order--;
613 : : }
614 : :
615 : : fstart = -1;
616 : : buddy = mb_find_buddy(e4b, 0, &max);
617 : : for (i = 0; i < max; i++) {
618 : : if (!mb_test_bit(i, buddy)) {
619 : : MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
620 : : if (fstart == -1) {
621 : : fragments++;
622 : : fstart = i;
623 : : }
624 : : continue;
625 : : }
626 : : fstart = -1;
627 : : /* check used bits only */
628 : : for (j = 0; j < e4b->bd_blkbits + 1; j++) {
629 : : buddy2 = mb_find_buddy(e4b, j, &max2);
630 : : k = i >> j;
631 : : MB_CHECK_ASSERT(k < max2);
632 : : MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
633 : : }
634 : : }
635 : : MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
636 : : MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
637 : :
638 : : grp = ext4_get_group_info(sb, e4b->bd_group);
639 : : list_for_each(cur, &grp->bb_prealloc_list) {
640 : : ext4_group_t groupnr;
641 : : struct ext4_prealloc_space *pa;
642 : : pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
643 : : ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
644 : : MB_CHECK_ASSERT(groupnr == e4b->bd_group);
645 : : for (i = 0; i < pa->pa_len; i++)
646 : : MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
647 : : }
648 : : return 0;
649 : : }
650 : : #undef MB_CHECK_ASSERT
651 : : #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
652 : : __FILE__, __func__, __LINE__)
653 : : #else
654 : : #define mb_check_buddy(e4b)
655 : : #endif
656 : :
657 : : /*
658 : : * Divide the blocks starting at @first with length @len into
659 : : * smaller chunks of power-of-2 block counts.
660 : : * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
661 : : * then increase bb_counters[] for the corresponding chunk size.
662 : : */
663 : 0 : static void ext4_mb_mark_free_simple(struct super_block *sb,
664 : : void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
665 : : struct ext4_group_info *grp)
666 : : {
667 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
668 : : ext4_grpblk_t min;
669 : : ext4_grpblk_t max;
670 : : ext4_grpblk_t chunk;
671 : : unsigned short border;
672 : :
673 [ - + ]: 2351 : BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
674 : :
675 : 2351 : border = 2 << sb->s_blocksize_bits;
676 : :
677 [ + + ]: 9093 : while (len > 0) {
678 : : /* find how many blocks can be covered since this position */
679 : 13484 : max = ffs(first | border) - 1;
680 : :
681 : : /* find how many blocks of power 2 we need to mark */
682 : 6742 : min = fls(len) - 1;
683 : :
684 [ + + ]: 6742 : if (max < min)
685 : : min = max;
686 : 6742 : chunk = 1 << min;
687 : :
688 : : /* mark multiblock chunks only */
689 : 6742 : grp->bb_counters[min]++;
690 [ + + ]: 6742 : if (min > 0)
691 : 11156 : mb_clear_bit(first >> min,
692 : 5578 : buddy + sbi->s_mb_offsets[min]);
693 : :
694 : 6742 : len -= chunk;
695 : 6742 : first += chunk;
696 : : }
697 : 2351 : }
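
A worked example of the loop above with illustrative values: a free run
first = 6, len = 6 is split on alignment and size boundaries:

	/*
	 * pass 1: ffs(6) - 1 = 1 (alignment allows order 1),
	 *         fls(6) - 1 = 2 (length allows order 2), so min = 1,
	 *         chunk = 2: clear bit 6 >> 1 = 3 in the order-1 buddy,
	 *         bb_counters[1]++; now first = 8, len = 4
	 * pass 2: ffs(8) - 1 = 3, fls(4) - 1 = 2, so min = 2, chunk = 4:
	 *         clear bit 8 >> 2 = 2 in the order-2 buddy,
	 *         bb_counters[2]++; now len = 0
	 *
	 * i.e. blocks [6..11] become one order-1 and one order-2 free buddy.
	 */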
698 : :
699 : : /*
700 : : * Cache the order of the largest free extent we have available in this block
701 : : * group.
702 : : */
703 : : static void
704 : : mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
705 : : {
706 : : int i;
707 : : int bits;
708 : :
709 : 217477 : grp->bb_largest_free_order = -1; /* uninit */
710 : :
711 : 217477 : bits = sb->s_blocksize_bits + 1;
712 [ + + ][ + + ]: 286250 : for (i = bits; i >= 0; i--) {
[ + + ]
713 [ + + ][ + + ]: 286045 : if (grp->bb_counters[i] > 0) {
[ + + ]
714 : 217272 : grp->bb_largest_free_order = i;
715 : : break;
716 : : }
717 : : }
718 : : }
719 : :
720 : : static noinline_for_stack
721 : 0 : void ext4_mb_generate_buddy(struct super_block *sb,
722 : : void *buddy, void *bitmap, ext4_group_t group)
723 : : {
724 : : struct ext4_group_info *grp = ext4_get_group_info(sb, group);
725 : 168 : ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
726 : : ext4_grpblk_t i = 0;
727 : : ext4_grpblk_t first;
728 : : ext4_grpblk_t len;
729 : : unsigned free = 0;
730 : : unsigned fragments = 0;
731 [ + - ]: 168 : unsigned long long period = get_cycles();
732 : :
733 : : /* initialize buddy from bitmap which is an aggregation
734 : : * of on-disk bitmap and preallocations */
735 : : i = mb_find_next_zero_bit(bitmap, max, 0);
736 : 168 : grp->bb_first_free = i;
737 [ + + ]: 2746 : while (i < max) {
738 : 2578 : fragments++;
739 : : first = i;
740 : : i = mb_find_next_bit(bitmap, max, i);
741 : 2578 : len = i - first;
742 : 2578 : free += len;
743 [ + + ]: 2578 : if (len > 1)
744 : 2351 : ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
745 : : else
746 : 227 : grp->bb_counters[0]++;
747 [ + + ]: 2578 : if (i < max)
748 : : i = mb_find_next_zero_bit(bitmap, max, i);
749 : : }
750 : 168 : grp->bb_fragments = fragments;
751 : :
752 [ - + ]: 168 : if (free != grp->bb_free) {
753 : 0 : ext4_grp_locked_error(sb, group, 0, 0,
754 : : "%u clusters in bitmap, %u in gd; "
755 : : "block bitmap corrupt.",
756 : : free, grp->bb_free);
757 : : /*
758 : : * If we intend to continue, we consider group descriptor
759 : : * corrupt and update bb_free using bitmap value
760 : : */
761 : 0 : grp->bb_free = free;
762 : 0 : set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
763 : : }
764 : : mb_set_largest_free_order(sb, grp);
765 : :
766 : 168 : clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
767 : :
768 [ + - ]: 168 : period = get_cycles() - period;
769 : : spin_lock(&EXT4_SB(sb)->s_bal_lock);
770 : 168 : EXT4_SB(sb)->s_mb_buddies_generated++;
771 : 168 : EXT4_SB(sb)->s_mb_generation_time += period;
772 : : spin_unlock(&EXT4_SB(sb)->s_bal_lock);
773 : 168 : }
774 : :
775 : 0 : static void mb_regenerate_buddy(struct ext4_buddy *e4b)
776 : : {
777 : : int count;
778 : : int order = 1;
779 : : void *buddy;
780 : :
781 [ # # ]: 0 : while ((buddy = mb_find_buddy(e4b, order++, &count))) {
782 : 0 : ext4_set_bits(buddy, 0, count);
783 : : }
784 : 0 : e4b->bd_info->bb_fragments = 0;
785 [ # # ]: 0 : memset(e4b->bd_info->bb_counters, 0,
786 : : sizeof(*e4b->bd_info->bb_counters) *
787 : : (e4b->bd_sb->s_blocksize_bits + 2));
788 : :
789 : 0 : ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
790 : : e4b->bd_bitmap, e4b->bd_group);
791 : 0 : }
792 : :
793 : : /* The buddy information is attached to the buddy cache inode
794 : : * for convenience. The information regarding each group
795 : : * is loaded via ext4_mb_load_buddy. It involves the
796 : : * block bitmap and the buddy information, which are
797 : : * stored in the inode as
798 : : *
799 : : * { page }
800 : : * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
801 : : *
802 : : *
803 : : * one block each for bitmap and buddy information.
804 : : * So for each group we take up 2 blocks. A page can
805 : : * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
806 : : * So it can have information regarding groups_per_page which
807 : : * is blocks_per_page/2
808 : : *
809 : : * Locking note: This routine takes the block group lock of all groups
810 : : * for this page; do not hold this lock when calling this routine!
811 : : */
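
A minimal sketch of the index arithmetic implied by the layout above
(assuming a 4k page and 1k blocks, so blocks_per_page = 4 and
groups_per_page = 2; variable names are illustrative):

	/* illustrative only: locate group g's blocks in the buddy cache */
	int block = g * 2;			/* bitmap block; buddy is block + 1 */
	int pnum  = block / blocks_per_page;	/* page index in s_buddy_cache */
	int poff  = block % blocks_per_page;	/* block offset within that page */
	/* with the assumed sizes, groups 0..1 live in page 0, 2..3 in page 1 */

The same computation appears in ext4_mb_get_buddy_page_lock() and
ext4_mb_load_buddy() below.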
812 : :
813 : 0 : static int ext4_mb_init_cache(struct page *page, char *incore)
814 : : {
815 : : ext4_group_t ngroups;
816 : : int blocksize;
817 : : int blocks_per_page;
818 : : int groups_per_page;
819 : : int err = 0;
820 : : int i;
821 : : ext4_group_t first_group, group;
822 : : int first_block;
823 : : struct super_block *sb;
824 : : struct buffer_head *bhs;
825 : : struct buffer_head **bh = NULL;
826 : : struct inode *inode;
827 : : char *data;
828 : : char *bitmap;
829 : : struct ext4_group_info *grinfo;
830 : :
831 : : mb_debug(1, "init page %lu\n", page->index);
832 : :
833 : 336 : inode = page->mapping->host;
834 : 336 : sb = inode->i_sb;
835 : : ngroups = ext4_get_groups_count(sb);
836 : 336 : blocksize = 1 << inode->i_blkbits;
837 : 336 : blocks_per_page = PAGE_CACHE_SIZE / blocksize;
838 : :
839 : 336 : groups_per_page = blocks_per_page >> 1;
840 [ + - ]: 336 : if (groups_per_page == 0)
841 : : groups_per_page = 1;
842 : :
843 : : /* allocate buffer_heads to read bitmaps */
844 [ + - ]: 336 : if (groups_per_page > 1) {
845 : 0 : i = sizeof(struct buffer_head *) * groups_per_page;
846 : : bh = kzalloc(i, GFP_NOFS);
847 [ + - ]: 336 : if (bh == NULL) {
848 : : err = -ENOMEM;
849 : : goto out;
850 : : }
851 : : } else
852 : : bh = &bhs;
853 : :
854 : 336 : first_group = page->index * blocks_per_page / 2;
855 : :
856 : : /* read all groups the page covers into the cache */
857 [ + + ]: 672 : for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
858 [ + - ]: 336 : if (group >= ngroups)
859 : : break;
860 : :
861 : : grinfo = ext4_get_group_info(sb, group);
862 : : /*
863 : : * If page is uptodate then we came here after online resize
864 : : * which added some new uninitialized group info structs, so
865 : : * we must skip all initialized uptodate buddies on the page,
866 : : * which may be currently in use by an allocating task.
867 : : */
868 [ - + ][ # # ]: 336 : if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
869 : 0 : bh[i] = NULL;
870 : 0 : continue;
871 : : }
872 [ + - ]: 336 : if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
873 : : err = -ENOMEM;
874 : : goto out;
875 : : }
876 : : mb_debug(1, "read bitmap for group %u\n", group);
877 : : }
878 : :
879 : : /* wait for I/O completion */
880 [ + + ]: 672 : for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
881 [ + - ][ + - ]: 336 : if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
882 : : err = -EIO;
883 : : goto out;
884 : : }
885 : : }
886 : :
887 : 336 : first_block = page->index * blocks_per_page;
888 [ + + ]: 672 : for (i = 0; i < blocks_per_page; i++) {
889 : 336 : group = (first_block + i) >> 1;
890 [ + - ]: 336 : if (group >= ngroups)
891 : : break;
892 : :
893 [ - + ]: 336 : if (!bh[group - first_group])
894 : : /* skip initialized uptodate buddy */
895 : 0 : continue;
896 : :
897 : : /*
898 : : * data carries information regarding this
899 : : * particular group in the format specified
900 : : * above
901 : : *
902 : : */
903 : 336 : data = page_address(page) + (i * blocksize);
904 : 336 : bitmap = bh[group - first_group]->b_data;
905 : :
906 : : /*
907 : : * We place the buddy block and bitmap block
908 : : * close together
909 : : */
910 [ + + ]: 336 : if ((first_block + i) & 1) {
911 : : /* this is block of buddy */
912 [ - + ]: 168 : BUG_ON(incore == NULL);
913 : : mb_debug(1, "put buddy for group %u in page %lu/%x\n",
914 : : group, page->index, i * blocksize);
915 : : trace_ext4_mb_buddy_bitmap_load(sb, group);
916 : : grinfo = ext4_get_group_info(sb, group);
917 : 168 : grinfo->bb_fragments = 0;
918 [ + - ]: 168 : memset(grinfo->bb_counters, 0,
919 : : sizeof(*grinfo->bb_counters) *
920 : : (sb->s_blocksize_bits+2));
921 : : /*
922 : : * incore got set to the group block bitmap below
923 : : */
924 : : ext4_lock_group(sb, group);
925 : : /* init the buddy */
926 [ + - ]: 168 : memset(data, 0xff, blocksize);
927 : 168 : ext4_mb_generate_buddy(sb, data, incore, group);
928 : : ext4_unlock_group(sb, group);
929 : : incore = NULL;
930 : : } else {
931 : : /* this is block of bitmap */
932 [ - + ]: 168 : BUG_ON(incore != NULL);
933 : : mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
934 : : group, page->index, i * blocksize);
935 : : trace_ext4_mb_bitmap_load(sb, group);
936 : :
937 : : /* see comments in ext4_mb_put_pa() */
938 : : ext4_lock_group(sb, group);
939 : 168 : memcpy(data, bitmap, blocksize);
940 : :
941 : : /* mark all preallocated blks used in in-core bitmap */
942 : 168 : ext4_mb_generate_from_pa(sb, data, group);
943 : 168 : ext4_mb_generate_from_freelist(sb, data, group);
944 : : ext4_unlock_group(sb, group);
945 : :
946 : : /* set incore so that the buddy information can be
947 : : * generated using this
948 : : */
949 : : incore = data;
950 : : }
951 : : }
952 : : SetPageUptodate(page);
953 : :
954 : : out:
955 [ + - ]: 336 : if (bh) {
956 [ + + ]: 672 : for (i = 0; i < groups_per_page; i++)
957 : 336 : brelse(bh[i]);
958 [ - + ]: 336 : if (bh != &bhs)
959 : 0 : kfree(bh);
960 : : }
961 : 336 : return err;
962 : : }
963 : :
964 : : /*
965 : : * Lock the buddy and bitmap pages. This makes sure that a parallel
966 : : * init_group on the same buddy page doesn't happen while holding the
967 : : * buddy page lock. Return the locked buddy and bitmap pages in the e4b
968 : : * struct. If the buddy and bitmap are on the same page, e4b->bd_buddy_page
969 : : * is NULL and the return value is 0.
969 : : */
970 : 0 : static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
971 : : ext4_group_t group, struct ext4_buddy *e4b)
972 : : {
973 : 105 : struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
974 : : int block, pnum, poff;
975 : : int blocks_per_page;
976 : : struct page *page;
977 : :
978 : 105 : e4b->bd_buddy_page = NULL;
979 : 105 : e4b->bd_bitmap_page = NULL;
980 : :
981 : 105 : blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
982 : : /*
983 : : * the buddy cache inode stores the block bitmap
984 : : * and buddy information in consecutive blocks.
985 : : * So for each group we need two blocks.
986 : : */
987 : 105 : block = group * 2;
988 : 105 : pnum = block / blocks_per_page;
989 : 105 : poff = block % blocks_per_page;
990 : 105 : page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
991 [ + - ]: 105 : if (!page)
992 : : return -EIO;
993 [ - + ]: 105 : BUG_ON(page->mapping != inode->i_mapping);
994 : 105 : e4b->bd_bitmap_page = page;
995 : 105 : e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
996 : :
997 [ + - ]: 105 : if (blocks_per_page >= 2) {
998 : : /* buddy and bitmap are on the same page */
999 : : return 0;
1000 : : }
1001 : :
1002 : 105 : block++;
1003 : 105 : pnum = block / blocks_per_page;
1004 : 105 : page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1005 [ + ]: 105 : if (!page)
1006 : : return -EIO;
1007 [ - + ]: 210 : BUG_ON(page->mapping != inode->i_mapping);
1008 : 105 : e4b->bd_buddy_page = page;
1009 : 105 : return 0;
1010 : : }
1011 : :
1012 : 105 : static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1013 : : {
1014 [ + - ]: 105 : if (e4b->bd_bitmap_page) {
1015 : 105 : unlock_page(e4b->bd_bitmap_page);
1016 : 105 : page_cache_release(e4b->bd_bitmap_page);
1017 : : }
1018 [ + - ]: 105 : if (e4b->bd_buddy_page) {
1019 : 105 : unlock_page(e4b->bd_buddy_page);
1020 : 105 : page_cache_release(e4b->bd_buddy_page);
1021 : : }
1022 : 105 : }
1023 : :
1024 : : /*
1025 : : * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1026 : : * block group lock of all groups for this page; do not hold the BG lock when
1027 : : * calling this routine!
1028 : : */
1029 : : static noinline_for_stack
1030 : 0 : int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1031 : : {
1032 : :
1033 : : struct ext4_group_info *this_grp;
1034 : : struct ext4_buddy e4b;
1035 : : struct page *page;
1036 : : int ret = 0;
1037 : :
1038 : : might_sleep();
1039 : : mb_debug(1, "init group %u\n", group);
1040 : : this_grp = ext4_get_group_info(sb, group);
1041 : : /*
1042 : : * This ensures that we don't reinit the buddy cache
1043 : : * page which maps to the group from which we are already
1044 : : * allocating. If we are looking at the buddy cache we would
1045 : : * have taken a reference using ext4_mb_load_buddy and that
1046 : : * would have pinned buddy page to page cache.
1047 : : */
1048 : 105 : ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1049 [ + - ][ + - ]: 105 : if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1050 : : /*
1051 : : * somebody initialized the group
1052 : : * return without doing anything
1053 : : */
1054 : : goto err;
1055 : : }
1056 : :
1057 : 105 : page = e4b.bd_bitmap_page;
1058 : 105 : ret = ext4_mb_init_cache(page, NULL);
1059 [ + - ]: 105 : if (ret)
1060 : : goto err;
1061 [ + - ]: 105 : if (!PageUptodate(page)) {
1062 : : ret = -EIO;
1063 : : goto err;
1064 : : }
1065 : 105 : mark_page_accessed(page);
1066 : :
1067 [ + - ]: 105 : if (e4b.bd_buddy_page == NULL) {
1068 : : /*
1069 : : * If both the bitmap and buddy are in
1070 : : * the same page we don't need to force
1071 : : * init the buddy
1072 : : */
1073 : : ret = 0;
1074 : : goto err;
1075 : : }
1076 : : /* init buddy cache */
1077 : : page = e4b.bd_buddy_page;
1078 : 105 : ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1079 [ + - ]: 105 : if (ret)
1080 : : goto err;
1081 [ + - ]: 105 : if (!PageUptodate(page)) {
1082 : : ret = -EIO;
1083 : : goto err;
1084 : : }
1085 : 105 : mark_page_accessed(page);
1086 : : err:
1087 : 105 : ext4_mb_put_buddy_page_lock(&e4b);
1088 : 105 : return ret;
1089 : : }
1090 : :
1091 : : /*
1092 : : * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1093 : : * block group lock of all groups for this page; do not hold the BG lock when
1094 : : * calling this routine!
1095 : : */
1096 : : static noinline_for_stack int
1097 : 0 : ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1098 : : struct ext4_buddy *e4b)
1099 : : {
1100 : : int blocks_per_page;
1101 : : int block;
1102 : : int pnum;
1103 : : int poff;
1104 : : struct page *page;
1105 : : int ret;
1106 : : struct ext4_group_info *grp;
1107 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
1108 : 280513 : struct inode *inode = sbi->s_buddy_cache;
1109 : :
1110 : : might_sleep();
1111 : : mb_debug(1, "load group %u\n", group);
1112 : :
1113 : 280513 : blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1114 : : grp = ext4_get_group_info(sb, group);
1115 : :
1116 : 280513 : e4b->bd_blkbits = sb->s_blocksize_bits;
1117 : 280513 : e4b->bd_info = grp;
1118 : 280513 : e4b->bd_sb = sb;
1119 : 280513 : e4b->bd_group = group;
1120 : 280513 : e4b->bd_buddy_page = NULL;
1121 : 280513 : e4b->bd_bitmap_page = NULL;
1122 : :
1123 [ + + ]: 280513 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1124 : : /*
1125 : : * we need full data about the group
1126 : : * to make a good selection
1127 : : */
1128 : 67 : ret = ext4_mb_init_group(sb, group);
1129 [ + - ]: 67 : if (ret)
1130 : : return ret;
1131 : : }
1132 : :
1133 : : /*
1134 : : * the buddy cache inode stores the block bitmap
1135 : : * and buddy information in consecutive blocks.
1136 : : * So for each group we need two blocks.
1137 : : */
1138 : 280513 : block = group * 2;
1139 : 280513 : pnum = block / blocks_per_page;
1140 : 280513 : poff = block % blocks_per_page;
1141 : :
1142 : : /* we could use find_or_create_page(), but it locks the page,
1143 : : * which we'd like to avoid in the fast path ... */
1144 : 280513 : page = find_get_page(inode->i_mapping, pnum);
1145 [ + + ][ + ]: 841573 : if (page == NULL || !PageUptodate(page)) {
1146 [ + + ]: 64 : if (page)
1147 : : /*
1148 : : * drop the page reference and try
1149 : : * to get the page with lock. If we
1150 : : * are not uptodate that implies
1151 : : * somebody just created the page but
1152 : : * is yet to initialize the same. So
1153 : : * wait for it to initialize.
1154 : : */
1155 : 1 : page_cache_release(page);
1156 : 64 : page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1157 [ + - ]: 64 : if (page) {
1158 [ - + ]: 64 : BUG_ON(page->mapping != inode->i_mapping);
1159 [ + + ]: 64 : if (!PageUptodate(page)) {
1160 : 63 : ret = ext4_mb_init_cache(page, NULL);
1161 [ - + ]: 63 : if (ret) {
1162 : 0 : unlock_page(page);
1163 : 0 : goto err;
1164 : : }
1165 : 63 : mb_cmp_bitmaps(e4b, page_address(page) +
1166 : : (poff * sb->s_blocksize));
1167 : : }
1168 : 64 : unlock_page(page);
1169 : : }
1170 : : }
1171 [ + + ][ + - ]: 841527 : if (page == NULL || !PageUptodate(page)) {
1172 : : ret = -EIO;
1173 : : goto err;
1174 : : }
1175 : 280495 : e4b->bd_bitmap_page = page;
1176 : 280495 : e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1177 : 280340 : mark_page_accessed(page);
1178 : :
1179 : 280368 : block++;
1180 : 280368 : pnum = block / blocks_per_page;
1181 : 280368 : poff = block % blocks_per_page;
1182 : :
1183 : 280368 : page = find_get_page(inode->i_mapping, pnum);
1184 [ + + ][ + + ]: 561115 : if (page == NULL || !PageUptodate(page)) {
1185 [ + + ]: 64 : if (page)
1186 : 1 : page_cache_release(page);
1187 : 64 : page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1188 [ + - ]: 64 : if (page) {
1189 [ - + ]: 64 : BUG_ON(page->mapping != inode->i_mapping);
1190 [ + + ]: 64 : if (!PageUptodate(page)) {
1191 : 63 : ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
1192 [ - + ]: 63 : if (ret) {
1193 : 0 : unlock_page(page);
1194 : 0 : goto err;
1195 : : }
1196 : : }
1197 : 64 : unlock_page(page);
1198 : : }
1199 : : }
1200 [ + - ][ + - ]: 561005 : if (page == NULL || !PageUptodate(page)) {
1201 : : ret = -EIO;
1202 : : goto err;
1203 : : }
1204 : 280467 : e4b->bd_buddy_page = page;
1205 : 280467 : e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1206 : 280504 : mark_page_accessed(page);
1207 : :
1208 [ - + ]: 280458 : BUG_ON(e4b->bd_bitmap_page == NULL);
1209 [ - + ]: 280458 : BUG_ON(e4b->bd_buddy_page == NULL);
1210 : :
1211 : : return 0;
1212 : :
1213 : : err:
1214 [ # # ]: 280513 : if (page)
1215 : 0 : page_cache_release(page);
1216 [ # # ]: 0 : if (e4b->bd_bitmap_page)
1217 : 0 : page_cache_release(e4b->bd_bitmap_page);
1218 [ # # ]: 0 : if (e4b->bd_buddy_page)
1219 : 0 : page_cache_release(e4b->bd_buddy_page);
1220 : 0 : e4b->bd_buddy = NULL;
1221 : 0 : e4b->bd_bitmap = NULL;
1222 : 0 : return ret;
1223 : : }
1224 : :
1225 : 280626 : static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1226 : : {
1227 [ + + ]: 280626 : if (e4b->bd_bitmap_page)
1228 : 280625 : page_cache_release(e4b->bd_bitmap_page);
1229 [ + - ]: 280561 : if (e4b->bd_buddy_page)
1230 : 280561 : page_cache_release(e4b->bd_buddy_page);
1231 : 280596 : }
1232 : :
1233 : :
1234 : 0 : static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1235 : : {
1236 : : int order = 1;
1237 : : void *bb;
1238 : :
1239 [ - + ]: 1163650 : BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1240 [ + - ]: 1163650 : BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1241 : :
1242 : : bb = e4b->bd_buddy;
1243 [ + + ]: 8140293 : while (order <= e4b->bd_blkbits + 1) {
1244 : 7724753 : block = block >> 1;
1245 [ + + ]: 7724753 : if (!mb_test_bit(block, bb)) {
1246 : : /* this block is part of buddy of order 'order' */
1247 : : return order;
1248 : : }
1249 : 6976643 : bb += 1 << (e4b->bd_blkbits - order);
1250 : 6976643 : order++;
1251 : : }
1252 : : return 0;
1253 : : }
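
A short reading of the walk above: at each order k the buddy bitmap
keeps one bit per 2^k-block chunk, and a clear bit means that whole
chunk is free.

	/*
	 * Illustrative walk for block 6: test bit 6 >> 1 = 3 in the
	 * order-1 map; if set (chunk busy), advance bb by
	 * 1 << (bd_blkbits - 1) to the order-2 map and test bit
	 * 3 >> 1 = 1; the first clear bit found gives the order of the
	 * free buddy containing the block.
	 */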
1254 : :
1255 : 0 : static void mb_clear_bits(void *bm, int cur, int len)
1256 : : {
1257 : : __u32 *addr;
1258 : :
1259 : 110164 : len = cur + len;
1260 [ + + ]: 311194 : while (cur < len) {
1261 [ + + ][ + + ]: 201030 : if ((cur & 31) == 0 && (len - cur) >= 32) {
1262 : : /* fast path: clear whole word at once */
1263 : 18748 : addr = bm + (cur >> 3);
1264 : 18748 : *addr = 0;
1265 : 18748 : cur += 32;
1266 : 18748 : continue;
1267 : : }
1268 : : mb_clear_bit(cur, bm);
1269 : 201030 : cur++;
1270 : : }
1271 : 110164 : }
1272 : :
1273 : : /* clear bits in given range
1274 : : * will return first found zero bit if any, -1 otherwise
1275 : : */
1276 : 0 : static int mb_test_and_clear_bits(void *bm, int cur, int len)
1277 : : {
1278 : : __u32 *addr;
1279 : : int zero_bit = -1;
1280 : :
1281 : 150951 : len = cur + len;
1282 [ + + ]: 912271 : while (cur < len) {
1283 [ + + ][ + + ]: 761265 : if ((cur & 31) == 0 && (len - cur) >= 32) {
1284 : : /* fast path: clear whole word at once */
1285 : 39963 : addr = bm + (cur >> 3);
1286 [ - ][ # # ]: 39963 : if (*addr != (__u32)(-1) && zero_bit == -1)
1287 : 0 : zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1288 : 40018 : *addr = 0;
1289 : 40018 : cur += 32;
1290 : 40018 : continue;
1291 : : }
1292 [ - + ][ # # ]: 721302 : if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1293 : : zero_bit = cur;
1294 : 761320 : cur++;
1295 : : }
1296 : :
1297 : 151006 : return zero_bit;
1298 : : }
1299 : :
1300 : 0 : void ext4_set_bits(void *bm, int cur, int len)
1301 : : {
1302 : : __u32 *addr;
1303 : :
1304 : 260923 : len = cur + len;
1305 [ + + ]: 960533 : while (cur < len) {
1306 [ + + ][ + + ]: 699610 : if ((cur & 31) == 0 && (len - cur) >= 32) {
1307 : : /* fast path: set whole word at once */
1308 : 68087 : addr = bm + (cur >> 3);
1309 : 68087 : *addr = 0xffffffff;
1310 : 68087 : cur += 32;
1311 : 68087 : continue;
1312 : : }
1313 : : mb_set_bit(cur, bm);
1314 : 699610 : cur++;
1315 : : }
1316 : 260923 : }
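
The word-at-a-time fast path above is equivalent to setting the 32 bits
individually; a minimal check under the assumption of a word-aligned run
(hypothetical buffers):

	__u32 a[4] = { 0 }, b[4] = { 0 };
	int i;

	for (i = 32; i < 64; i++)
		mb_set_bit(i, a);	/* one bit at a time */
	ext4_set_bits(b, 32, 32);	/* takes the *addr = 0xffffffff path */
	/* here a[1] == b[1] == 0xffffffff and the arrays compare equal */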
1317 : :
1318 : : /*
1319 : : * _________________________________________________________________ */
1320 : :
1321 : : static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1322 : : {
1323 [ + + ][ + + ]: 371473 : if (mb_test_bit(*bit + side, bitmap)) {
1324 : : mb_clear_bit(*bit, bitmap);
1325 : 198567 : (*bit) -= side;
1326 : : return 1;
1327 : : }
1328 : : else {
1329 : : (*bit) += side;
1330 : : mb_set_bit(*bit, bitmap);
1331 : : return -1;
1332 : : }
1333 : : }
1334 : :
1335 : 0 : static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1336 : : {
1337 : : int max;
1338 : : int order = 1;
1339 : 129400 : void *buddy = mb_find_buddy(e4b, order, &max);
1340 : :
1341 [ + + ]: 376627 : while (buddy) {
1342 : : void *buddy2;
1343 : :
1344 : : /* Bits in range [first; last] are known to be set since
1345 : : * corresponding blocks were allocated. Bits in range
1346 : : * (first; last) will stay set because they form buddies on
1347 : : * upper layer. We just deal with borders if they don't
1348 : : * align with upper layer and then go up.
1349 : : * Releasing entire group is all about clearing
1350 : : * single bit of highest order buddy.
1351 : : */
1352 : :
1353 : : /* Example:
1354 : : * ---------------------------------
1355 : : * | 1 | 1 | 1 | 1 |
1356 : : * ---------------------------------
1357 : : * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1358 : : * ---------------------------------
1359 : : * 0 1 2 3 4 5 6 7
1360 : : * \_____________________/
1361 : : *
1362 : : * Neither [1] nor [6] is aligned to above layer.
1363 : : * Left neighbour [0] is free, so mark it busy,
1364 : : * decrease bb_counters and extend range to
1365 : : * [0; 6]
1366 : : * Right neighbour [7] is busy. It can't be coalesced with [6], so
1367 : : * mark [6] free, increase bb_counters and shrink range to
1368 : : * [0; 5].
1369 : : * Then shift range to [0; 2], go up and do the same.
1370 : : */
1371 : :
1372 : :
1373 [ + + ]: 376586 : if (first & 1)
1374 : 185558 : e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1375 [ + + ]: 376586 : if (!(last & 1))
1376 : 185915 : e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1377 [ + ]: 376586 : if (first > last)
1378 : : break;
1379 : 247309 : order++;
1380 : :
1381 [ + ][ + + ]: 247309 : if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1382 : 91 : mb_clear_bits(buddy, first, last - first + 1);
1383 : 94 : e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1384 : 94 : break;
1385 : : }
1386 : 247218 : first >>= 1;
1387 : 247218 : last >>= 1;
1388 : : buddy = buddy2;
1389 : : }
1390 : 12 : }
1391 : :
1392 : 0 : static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1393 : : int first, int count)
1394 : : {
1395 : : int left_is_free = 0;
1396 : : int right_is_free = 0;
1397 : : int block;
1398 : 151009 : int last = first + count - 1;
1399 : 302012 : struct super_block *sb = e4b->bd_sb;
1400 : :
1401 [ - + ]: 151009 : BUG_ON(last >= (sb->s_blocksize << 3));
1402 [ - + ]: 151009 : assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1403 : : /* Don't bother if the block group is corrupt. */
1404 [ + ]: 151009 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1405 : 150990 : return;
1406 : :
1407 : : mb_check_buddy(e4b);
1408 : : mb_free_blocks_double(inode, e4b, first, count);
1409 : :
1410 : 151010 : e4b->bd_info->bb_free += count;
1411 [ + + ]: 151010 : if (first < e4b->bd_info->bb_first_free)
1412 : 1038 : e4b->bd_info->bb_first_free = first;
1413 : :
1414 : : /* access memory sequentially: check left neighbour,
1415 : : * clear range and then check right neighbour
1416 : : */
1417 [ + + ]: 151010 : if (first != 0)
1418 : 150540 : left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1419 : 151010 : block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1420 [ + + ]: 151003 : if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1421 : 150954 : right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1422 : :
1423 [ - + ]: 151003 : if (unlikely(block != -1)) {
1424 : : ext4_fsblk_t blocknr;
1425 : :
1426 : 0 : blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1427 : 0 : blocknr += EXT4_C2B(EXT4_SB(sb), block);
1428 [ # # ]: 0 : ext4_grp_locked_error(sb, e4b->bd_group,
1429 : : inode ? inode->i_ino : 0,
1430 : : blocknr,
1431 : : "freeing already freed block "
1432 : : "(bit %u); block bitmap corrupt.",
1433 : : block);
1434 : : /* Mark the block group as corrupt. */
1435 : 0 : set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1436 : : &e4b->bd_info->bb_state);
1437 : 0 : mb_regenerate_buddy(e4b);
1438 : 0 : goto done;
1439 : : }
1440 : :
1441 : : /* let's maintain fragments counter */
1442 [ + + ]: 151003 : if (left_is_free && right_is_free)
1443 : 61801 : e4b->bd_info->bb_fragments--;
1444 [ + + ]: 89202 : else if (!left_is_free && !right_is_free)
1445 : 64225 : e4b->bd_info->bb_fragments++;
1446 : :
1447 : : /* buddy[0] == bd_bitmap is a special case, so handle
1448 : : * it right away and let mb_buddy_mark_free stay free of
1449 : : * zero order checks.
1450 : : * Check if neighbours are to be coalesced,
1451 : : * adjust bitmap bb_counters and borders appropriately.
1452 : : */
1453 [ + + ]: 151003 : if (first & 1) {
1454 : 74112 : first += !left_is_free;
1455 [ + + ]: 74112 : e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1456 : : }
1457 [ + + ]: 151003 : if (!(last & 1)) {
1458 : 74250 : last -= !right_is_free;
1459 [ + + ]: 74250 : e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1460 : : }
1461 : :
1462 [ + ]: 151003 : if (first <= last)
1463 : 129395 : mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1464 : :
1465 : : done:
1466 : 150991 : mb_set_largest_free_order(sb, e4b->bd_info);
1467 : : mb_check_buddy(e4b);
1468 : : }
1469 : :
1470 : 0 : static int mb_find_extent(struct ext4_buddy *e4b, int block,
1471 : : int needed, struct ext4_free_extent *ex)
1472 : : {
1473 : : int next = block;
1474 : : int max, order;
1475 : : void *buddy;
1476 : :
1477 [ - + ]: 553703 : assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1478 [ - + ]: 553703 : BUG_ON(ex == NULL);
1479 : :
1480 : 553703 : buddy = mb_find_buddy(e4b, 0, &max);
1481 [ - + ]: 1107408 : BUG_ON(buddy == NULL);
1482 [ - + ]: 553705 : BUG_ON(block >= max);
1483 [ + + ]: 553705 : if (mb_test_bit(block, buddy)) {
1484 : 1503 : ex->fe_len = 0;
1485 : 1503 : ex->fe_start = 0;
1486 : 1503 : ex->fe_group = 0;
1487 : 1503 : return 0;
1488 : : }
1489 : :
1490 : : /* find actual order */
1491 : 552202 : order = mb_find_order_for_block(e4b, block);
1492 : 552223 : block = block >> order;
1493 : :
1494 : 552223 : ex->fe_len = 1 << order;
1495 : 552223 : ex->fe_start = block << order;
1496 : 552223 : ex->fe_group = e4b->bd_group;
1497 : :
1498 : : /* calc difference from given start */
1499 : 552223 : next = next - ex->fe_start;
1500 : 552223 : ex->fe_len -= next;
1501 : 552223 : ex->fe_start += next;
1502 : :
1503 [ + + + ]: 1696472 : while (needed > ex->fe_len &&
1504 : 735530 : mb_find_buddy(e4b, order, &max)) {
1505 : :
1506 [ + + ]: 735534 : if (block + 1 >= max)
1507 : : break;
1508 : :
1509 : 735529 : next = (block + 1) * (1 << order);
1510 [ + + ]: 735529 : if (mb_test_bit(next, e4b->bd_bitmap))
1511 : : break;
1512 : :
1513 : 408727 : order = mb_find_order_for_block(e4b, next);
1514 : :
1515 : 408725 : block = next >> order;
1516 : 408725 : ex->fe_len += 1 << order;
1517 : : }
1518 : :
1519 [ - + ]: 552215 : BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1520 : : return ex->fe_len;
1521 : : }
1522 : :
1523 : 0 : static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1524 : : {
1525 : : int ord;
1526 : : int mlen = 0;
1527 : 66316 : int max = 0;
1528 : : int cur;
1529 : 66316 : int start = ex->fe_start;
1530 : 66316 : int len = ex->fe_len;
1531 : : unsigned ret = 0;
1532 : : int len0 = len;
1533 : : void *buddy;
1534 : :
1535 [ - + ]: 66316 : BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1536 [ - + ]: 66316 : BUG_ON(e4b->bd_group != ex->fe_group);
1537 [ - + ]: 66316 : assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1538 : : mb_check_buddy(e4b);
1539 : : mb_mark_used_double(e4b, start, len);
1540 : :
1541 : 66316 : e4b->bd_info->bb_free -= len;
1542 [ + + ]: 66316 : if (e4b->bd_info->bb_first_free == start)
1543 : 7749 : e4b->bd_info->bb_first_free += len;
1544 : :
1545 : : /* let's maintain fragments counter */
1546 [ + + ]: 66316 : if (start != 0)
1547 : 65867 : mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1548 [ + + ]: 66316 : if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1549 : 66292 : max = !mb_test_bit(start + len, e4b->bd_bitmap);
1550 [ + + ]: 66316 : if (mlen && max)
1551 : 66316 : e4b->bd_info->bb_fragments++;
1552 [ + + ]: 61314 : else if (!mlen && !max)
1553 : 7396 : e4b->bd_info->bb_fragments--;
1554 : :
1555 : : /* let's maintain buddy itself */
1556 [ + + ]: 269133 : while (len) {
1557 : 202815 : ord = mb_find_order_for_block(e4b, start);
1558 : :
1559 [ + + ][ + + ]: 202822 : if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1560 : : /* the whole chunk may be allocated at once! */
1561 : : mlen = 1 << ord;
1562 : 115987 : buddy = mb_find_buddy(e4b, ord, &max);
1563 [ - + ]: 182298 : BUG_ON((start >> ord) >= max);
1564 : : mb_set_bit(start >> ord, buddy);
1565 : 115982 : e4b->bd_info->bb_counters[ord]--;
1566 : 115982 : start += mlen;
1567 : 115982 : len -= mlen;
1568 [ - + ]: 115982 : BUG_ON(len < 0);
1569 : 115982 : continue;
1570 : : }
1571 : :
1572 : : /* store for history */
1573 [ + + ]: 86835 : if (ret == 0)
1574 : 33202 : ret = len | (ord << 16);
1575 : :
1576 : : /* we have to split large buddy */
1577 [ - + ]: 86835 : BUG_ON(ord <= 0);
1578 : 86835 : buddy = mb_find_buddy(e4b, ord, &max);
1579 : : mb_set_bit(start >> ord, buddy);
1580 : 86835 : e4b->bd_info->bb_counters[ord]--;
1581 : :
1582 : 86835 : ord--;
1583 : 86835 : cur = (start >> ord) & ~1U;
1584 : 86835 : buddy = mb_find_buddy(e4b, ord, &max);
1585 : : mb_clear_bit(cur, buddy);
1586 : 86835 : mb_clear_bit(cur + 1, buddy);
1587 : 86835 : e4b->bd_info->bb_counters[ord]++;
1588 : 202817 : e4b->bd_info->bb_counters[ord]++;
1589 : : }
1590 : 66318 : mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1591 : :
1592 : 66318 : ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1593 : : mb_check_buddy(e4b);
1594 : :
1595 : 66325 : return ret;
1596 : : }
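/*
 * Illustrative sketch of the split step in the loop above (counter
 * bookkeeping only; the bitmap updates are omitted): consuming one
 * order-ord buddy yields two order-(ord - 1) halves, which is why
 * bb_counters[ord - 1] is incremented twice.
 */
static void split_one_level(int *bb_counters, int ord)
{
	bb_counters[ord]--;		/* the big chunk is taken apart */
	bb_counters[ord - 1] += 2;	/* ...into two half-size chunks */
}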
1597 : :
1598 : : /*
1599 : : * Must be called under group lock!
1600 : : */
1601 : 0 : static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1602 : : struct ext4_buddy *e4b)
1603 : : {
1604 : 66319 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1605 : : int ret;
1606 : :
1607 [ - + ]: 66319 : BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1608 [ - + ]: 66319 : BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1609 : :
1610 : 66319 : ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1611 : 66319 : ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1612 : 66319 : ret = mb_mark_used(e4b, &ac->ac_b_ex);
1613 : :
1614 : :	/* preallocation can change ac_b_ex, thus we store the actually
1615 : :	 * allocated blocks for history */
1616 : 66325 : ac->ac_f_ex = ac->ac_b_ex;
1617 : :
1618 : 66325 : ac->ac_status = AC_STATUS_FOUND;
1619 : 66325 : ac->ac_tail = ret & 0xffff;
1620 : 66325 : ac->ac_buddy = ret >> 16;
1621 : :
1622 : : /*
1623 : : * take the page reference. We want the page to be pinned
1624 : :	 * so that we don't get an ext4_mb_init_cache() call for this
1625 : :	 * group until we update the bitmap. That would mean we
1626 : :	 * could double-allocate blocks. The reference is dropped
1627 : :	 * in ext4_mb_release_context().
1628 : : */
1629 : 66325 : ac->ac_bitmap_page = e4b->bd_bitmap_page;
1630 : : get_page(ac->ac_bitmap_page);
1631 : 66322 : ac->ac_buddy_page = e4b->bd_buddy_page;
1632 : : get_page(ac->ac_buddy_page);
1633 : : /* store last allocated for subsequent stream allocation */
1634 [ + + ]: 66322 : if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1635 : : spin_lock(&sbi->s_md_lock);
1636 : 25659 : sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1637 : 25659 : sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1638 : : spin_unlock(&sbi->s_md_lock);
1639 : : }
1640 : 66322 : }
1641 : :
1642 : : /*
1643 : : * regular allocator, for general purposes allocation
1644 : : */
1645 : :
1646 : 0 : static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1647 : : struct ext4_buddy *e4b,
1648 : : int finish_group)
1649 : : {
1650 : 494654 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1651 : : struct ext4_free_extent *bex = &ac->ac_b_ex;
1652 : : struct ext4_free_extent *gex = &ac->ac_g_ex;
1653 : : struct ext4_free_extent ex;
1654 : : int max;
1655 : :
1656 [ + + ]: 494654 : if (ac->ac_status == AC_STATUS_FOUND)
1657 : 355026 : return;
1658 : : /*
1659 : : * We don't want to scan for a whole year
1660 : : */
1661 [ + + ][ + - ]: 461507 : if (ac->ac_found > sbi->s_mb_max_to_scan &&
1662 : 2 : !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1663 : 2 : ac->ac_status = AC_STATUS_BREAK;
1664 : 2 : return;
1665 : : }
1666 : :
1667 : : /*
1668 : :	 * Haven't found a good chunk so far; let's continue
1669 : : */
1670 [ + + ]: 461505 : if (bex->fe_len < gex->fe_len)
1671 : : return;
1672 : :
1673 [ + + ][ + + ]: 167065 : if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1674 [ + - ]: 27437 : && bex->fe_group == e4b->bd_group) {
1675 : : /* recheck chunk's availability - we don't know
1676 : : * when it was found (within this lock-unlock
1677 : : * period or not) */
1678 : 27437 : max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1679 [ + - ]: 27437 : if (max >= gex->fe_len) {
1680 : 167065 : ext4_mb_use_best_found(ac, e4b);
1681 : 27437 : return;
1682 : : }
1683 : : }
1684 : : }
1685 : :
1686 : : /*
1687 : :  * The routine checks whether the found extent is good enough. If it is,
1688 : :  * the extent gets marked used and a flag is set in the context
1689 : :  * to stop scanning. Otherwise, the extent is compared with the
1690 : :  * previously found extent, and if the new one is better, it's stored
1691 : :  * in the context. Later, the best found extent will be used if
1692 : :  * mballoc can't find a good enough extent.
1693 : :  *
1694 : :  * FIXME: the real allocation policy is yet to be designed!
1695 : : */
1696 : 0 : static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1697 : : struct ext4_free_extent *ex,
1698 : : struct ext4_buddy *e4b)
1699 : : {
1700 : : struct ext4_free_extent *bex = &ac->ac_b_ex;
1701 : : struct ext4_free_extent *gex = &ac->ac_g_ex;
1702 : :
1703 [ - + ]: 501369 : BUG_ON(ex->fe_len <= 0);
1704 [ - + ]: 501369 : BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1705 [ - + ]: 501369 : BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1706 [ - + ]: 501369 : BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1707 : :
1708 : 501369 : ac->ac_found++;
1709 : :
1710 : : /*
1711 : : * The special case - take what you catch first
1712 : : */
1713 [ - + ]: 501369 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1714 : 0 : *bex = *ex;
1715 : 0 : ext4_mb_use_best_found(ac, e4b);
1716 : 0 : return;
1717 : : }
1718 : :
1719 : : /*
1720 : :	 * Let's check whether the chunk is good enough
1721 : : */
1722 [ + + ]: 501369 : if (ex->fe_len == gex->fe_len) {
1723 : 24177 : *bex = *ex;
1724 : 24177 : ext4_mb_use_best_found(ac, e4b);
1725 : 24177 : return;
1726 : : }
1727 : :
1728 : : /*
1729 : : * If this is first found extent, just store it in the context
1730 : : */
1731 [ + + ]: 477192 : if (bex->fe_len == 0) {
1732 : 34153 : *bex = *ex;
1733 : 34153 : return;
1734 : : }
1735 : :
1736 : : /*
1737 : : * If new found extent is better, store it in the context
1738 : : */
1739 [ + + ]: 443039 : if (bex->fe_len < gex->fe_len) {
1740 : : /* if the request isn't satisfied, any found extent
1741 : : * larger than previous best one is better */
1742 [ + + ]: 301953 : if (ex->fe_len > bex->fe_len)
1743 : 39066 : *bex = *ex;
1744 [ + + ]: 141086 : } else if (ex->fe_len > gex->fe_len) {
1745 : : /* if the request is satisfied, then we try to find
1746 : :		 * an extent that still satisfies the request, but is
1747 : :		 * smaller than the previous one */
1748 [ + + ]: 131266 : if (ex->fe_len < bex->fe_len)
1749 : 5161 : *bex = *ex;
1750 : : }
1751 : :
1752 : 443039 : ext4_mb_check_limits(ac, e4b, 0);
1753 : : }
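/*
 * The comparison rules above, restated as a hedged standalone predicate
 * (illustrative only): while the goal is unmet, a bigger extent is
 * better; once the goal is met, the smallest extent that still covers
 * it wins. Exact matches are taken immediately by the caller.
 */
static int extent_is_better(int found, int best, int goal)
{
	if (best == 0)
		return 1;			/* first candidate */
	if (best < goal)
		return found > best;		/* goal unmet: prefer bigger */
	return found >= goal && found < best;	/* goal met: prefer tighter */
}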
1754 : :
1755 : : static noinline_for_stack
1756 : 0 : int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1757 : 1 : struct ext4_buddy *e4b)
1758 : : {
1759 : 1 : struct ext4_free_extent ex = ac->ac_b_ex;
1760 : 1 : ext4_group_t group = ex.fe_group;
1761 : : int max;
1762 : : int err;
1763 : :
1764 [ - + ]: 1 : BUG_ON(ex.fe_len <= 0);
1765 : 1 : err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1766 [ + - ]: 1 : if (err)
1767 : : return err;
1768 : :
1769 : 1 : ext4_lock_group(ac->ac_sb, group);
1770 : 1 : max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1771 : :
1772 [ + - ]: 1 : if (max > 0) {
1773 : 1 : ac->ac_b_ex = ex;
1774 : 1 : ext4_mb_use_best_found(ac, e4b);
1775 : : }
1776 : :
1777 : 1 : ext4_unlock_group(ac->ac_sb, group);
1778 : 1 : ext4_mb_unload_buddy(e4b);
1779 : :
1780 : 1 : return 0;
1781 : : }
1782 : :
1783 : : static noinline_for_stack
1784 : 0 : int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1785 : 24937 : struct ext4_buddy *e4b)
1786 : : {
1787 : 66127 : ext4_group_t group = ac->ac_g_ex.fe_group;
1788 : : int max;
1789 : : int err;
1790 : 66127 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1791 : : struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1792 : : struct ext4_free_extent ex;
1793 : :
1794 [ + + ]: 66127 : if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1795 : : return 0;
1796 [ + + ]: 24953 : if (grp->bb_free == 0)
1797 : : return 0;
1798 : :
1799 : 24936 : err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1800 [ + - ]: 24937 : if (err)
1801 : : return err;
1802 : :
1803 [ - + ]: 24937 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1804 : 0 : ext4_mb_unload_buddy(e4b);
1805 : 0 : return 0;
1806 : : }
1807 : :
1808 : 24937 : ext4_lock_group(ac->ac_sb, group);
1809 : 24936 : max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1810 : : ac->ac_g_ex.fe_len, &ex);
1811 : :
1812 [ + + ][ + ]: 91064 : if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1813 : : ext4_fsblk_t start;
1814 : :
1815 : 132254 : start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1816 : 66127 : ex.fe_start;
1817 : : /* use do_div to get remainder (would be 64-bit modulo) */
1818 [ - + ][ # # ]: 66127 : if (do_div(start, sbi->s_stripe) == 0) {
1819 : 0 : ac->ac_found++;
1820 : 0 : ac->ac_b_ex = ex;
1821 : 0 : ext4_mb_use_best_found(ac, e4b);
1822 : : }
1823 [ + + ]: 24937 : } else if (max >= ac->ac_g_ex.fe_len) {
1824 [ - + ]: 10996 : BUG_ON(ex.fe_len <= 0);
1825 [ - + ]: 10996 : BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1826 [ - + ]: 10996 : BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1827 : 10996 : ac->ac_found++;
1828 : 10996 : ac->ac_b_ex = ex;
1829 : 10996 : ext4_mb_use_best_found(ac, e4b);
1830 [ + + ][ - + ]: 13941 : } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1831 : :		/* Sometimes, the caller may want to merge even a small
1832 : :		 * number of blocks into an existing extent */
1833 [ # # ]: 0 : BUG_ON(ex.fe_len <= 0);
1834 [ # # ]: 0 : BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1835 [ # # ]: 0 : BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1836 : 0 : ac->ac_found++;
1837 : 0 : ac->ac_b_ex = ex;
1838 : 0 : ext4_mb_use_best_found(ac, e4b);
1839 : : }
1840 : 24937 : ext4_unlock_group(ac->ac_sb, group);
1841 : 24937 : ext4_mb_unload_buddy(e4b);
1842 : :
1843 : 24936 : return 0;
1844 : : }
1845 : :
1846 : : /*
1847 : :  * The routine scans buddy structures (not the bitmap!) from the given
1848 : :  * order up to the max order, trying to find a chunk big enough to satisfy the request
1849 : : */
1850 : : static noinline_for_stack
1851 : 0 : void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1852 : : struct ext4_buddy *e4b)
1853 : : {
1854 : 7418 : struct super_block *sb = ac->ac_sb;
1855 : 3709 : struct ext4_group_info *grp = e4b->bd_info;
1856 : : void *buddy;
1857 : : int i;
1858 : : int k;
1859 : : int max;
1860 : :
1861 [ - + ]: 3709 : BUG_ON(ac->ac_2order <= 0);
1862 [ + - ]: 4515 : for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1863 [ + + ]: 4515 : if (grp->bb_counters[i] == 0)
1864 : 806 : continue;
1865 : :
1866 : 3709 : buddy = mb_find_buddy(e4b, i, &max);
1867 [ - + ]: 3709 : BUG_ON(buddy == NULL);
1868 : :
1869 : 3709 : k = mb_find_next_zero_bit(buddy, max, 0);
1870 [ - + ]: 3709 : BUG_ON(k >= max);
1871 : :
1872 : 3709 : ac->ac_found++;
1873 : :
1874 : 3709 : ac->ac_b_ex.fe_len = 1 << i;
1875 : 3709 : ac->ac_b_ex.fe_start = k << i;
1876 : 3709 : ac->ac_b_ex.fe_group = e4b->bd_group;
1877 : :
1878 : 3709 : ext4_mb_use_best_found(ac, e4b);
1879 : :
1880 [ - + ]: 3709 : BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1881 : :
1882 [ - + ]: 3709 : if (EXT4_SB(sb)->s_mb_stats)
1883 : 0 : atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1884 : :
1885 : : break;
1886 : : }
1887 : 3709 : }
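/*
 * Hedged note on the loop above: in the order-i buddy bitmap a clear
 * bit at position k denotes a free, naturally aligned run of 1 << i
 * clusters, so one zero-bit lookup yields an exact-fit extent.
 * Illustrative decoding (not kernel API):
 */
static void decode_buddy_hit(int k, int i, int *fe_start, int *fe_len)
{
	*fe_start = k << i;	/* cluster offset inside the group */
	*fe_len = 1 << i;	/* exactly the requested power of two */
}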
1888 : :
1889 : : /*
1890 : : * The routine scans the group and measures all found extents.
1891 : :  * To optimize scanning, the caller must pass the number of
1892 : :  * free blocks in the group, so the routine knows the upper limit.
1893 : : */
1894 : : static noinline_for_stack
1895 : 0 : void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1896 : : struct ext4_buddy *e4b)
1897 : : {
1898 : 1054256 : struct super_block *sb = ac->ac_sb;
1899 : 51576 : void *bitmap = e4b->bd_bitmap;
1900 : : struct ext4_free_extent ex;
1901 : : int i;
1902 : : int free;
1903 : :
1904 : 51576 : free = e4b->bd_info->bb_free;
1905 [ - + ]: 51576 : BUG_ON(free <= 0);
1906 : :
1907 : 51576 : i = e4b->bd_info->bb_first_free;
1908 : :
1909 [ + + ][ + + ]: 552929 : while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1910 : 501315 : i = mb_find_next_zero_bit(bitmap,
1911 : 501315 : EXT4_CLUSTERS_PER_GROUP(sb), i);
1912 [ - + ]: 501365 : if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1913 : : /*
1914 : :			 * If we have a corrupt bitmap, we won't find any
1915 : :			 * free blocks even though the group info says
1916 : :			 * we have free blocks
1917 : : */
1918 : 0 : ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1919 : : "%d free clusters as per "
1920 : : "group info. But bitmap says 0",
1921 : : free);
1922 : 0 : break;
1923 : : }
1924 : :
1925 : 501365 : mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1926 [ - + ]: 501358 : BUG_ON(ex.fe_len <= 0);
1927 [ - + ]: 501358 : if (free < ex.fe_len) {
1928 : 0 : ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1929 : : "%d free clusters as per "
1930 : : "group info. But got %d blocks",
1931 : : free, ex.fe_len);
1932 : : /*
1933 : :			 * The number of free blocks differs. This mostly
1934 : :			 * indicates that the bitmap is corrupt. So exit
1935 : : * without claiming the space.
1936 : : */
1937 : 0 : break;
1938 : : }
1939 : :
1940 : 501358 : ext4_mb_measure_extent(ac, &ex, e4b);
1941 : :
1942 : 501353 : i += ex.fe_len;
1943 : 501353 : free -= ex.fe_len;
1944 : : }
1945 : :
1946 : 51614 : ext4_mb_check_limits(ac, e4b, 1);
1947 : 51619 : }
1948 : :
1949 : : /*
1950 : :  * This is a special case for storage like RAID5:
1951 : :  * we try to find stripe-aligned chunks for stripe-size-multiple requests
1952 : : */
1953 : : static noinline_for_stack
1954 : 0 : void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1955 : : struct ext4_buddy *e4b)
1956 : : {
1957 : 0 : struct super_block *sb = ac->ac_sb;
1958 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
1959 : 0 : void *bitmap = e4b->bd_bitmap;
1960 : : struct ext4_free_extent ex;
1961 : : ext4_fsblk_t first_group_block;
1962 : : ext4_fsblk_t a;
1963 : : ext4_grpblk_t i;
1964 : : int max;
1965 : :
1966 [ # # ]: 0 : BUG_ON(sbi->s_stripe == 0);
1967 : :
1968 : : /* find first stripe-aligned block in group */
1969 : 0 : first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1970 : :
1971 : 0 : a = first_group_block + sbi->s_stripe - 1;
1972 [ # # ][ # # ]: 0 : do_div(a, sbi->s_stripe);
[ # # ][ # # ]
1973 : 0 : i = (a * sbi->s_stripe) - first_group_block;
1974 : :
1975 [ # # ]: 0 : while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1976 [ # # ]: 0 : if (!mb_test_bit(i, bitmap)) {
1977 : 0 : max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1978 [ # # ]: 0 : if (max >= sbi->s_stripe) {
1979 : 0 : ac->ac_found++;
1980 : 0 : ac->ac_b_ex = ex;
1981 : 0 : ext4_mb_use_best_found(ac, e4b);
1982 : 0 : break;
1983 : : }
1984 : : }
1985 : 0 : i += sbi->s_stripe;
1986 : : }
1987 : 0 : }
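/*
 * Hedged userspace sketch of the alignment step above, with plain
 * 64-bit division standing in for do_div() (which divides in place and
 * returns the remainder). Illustrative only.
 */
#include <stdint.h>

static int64_t first_aligned_offset(uint64_t first_group_block,
				    uint32_t stripe)
{
	uint64_t a = (first_group_block + stripe - 1) / stripe;

	return (int64_t)(a * stripe - first_group_block);
}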
1988 : :
1989 : : /* This is now called BEFORE we load the buddy bitmap. */
1990 : 0 : static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1991 : : ext4_group_t group, int cr)
1992 : : {
1993 : : unsigned free, fragments;
1994 : 230513 : int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1995 : : struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1996 : :
1997 [ - + ]: 230513 : BUG_ON(cr < 0 || cr >= 4);
1998 : :
1999 : 230513 : free = grp->bb_free;
2000 [ + + ]: 230513 : if (free == 0)
2001 : : return 0;
2002 [ + + ][ + + ]: 111611 : if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2003 : : return 0;
2004 : :
2005 [ + - ]: 111432 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2006 : : return 0;
2007 : :
2008 : : /* We only do this if the grp has never been initialized */
2009 [ + + ]: 111432 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2010 : 38 : int ret = ext4_mb_init_group(ac->ac_sb, group);
2011 [ + ]: 38 : if (ret)
2012 : : return 0;
2013 : : }
2014 : :
2015 : 111389 : fragments = grp->bb_fragments;
2016 [ + - ]: 111389 : if (fragments == 0)
2017 : : return 0;
2018 : :
2019 [ + + - - ]: 111389 : switch (cr) {
2020 : : case 0:
2021 [ - + ]: 7781 : BUG_ON(ac->ac_2order == 0);
2022 : :
2023 : : /* Avoid using the first bg of a flexgroup for data files */
2024 [ + - ][ + - ]: 7781 : if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2025 [ + + ]: 7781 : (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2026 : 7781 : ((group % flex_size) == 0))
2027 : : return 0;
2028 : :
2029 [ + - ][ + + ]: 7517 : if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2030 : 7517 : (free / fragments) >= ac->ac_g_ex.fe_len)
2031 : : return 1;
2032 : :
2033 [ + + ]: 260 : if (grp->bb_largest_free_order < ac->ac_2order)
2034 : : return 0;
2035 : :
2036 : 160 : return 1;
2037 : : case 1:
2038 [ + + ]: 103686 : if ((free / fragments) >= ac->ac_g_ex.fe_len)
2039 : : return 1;
2040 : : break;
2041 : : case 2:
2042 [ # # ]: 0 : if (free >= ac->ac_g_ex.fe_len)
2043 : : return 1;
2044 : : break;
2045 : : case 3:
2046 : : return 1;
2047 : : default:
2048 : 0 : BUG();
2049 : : }
2050 : :
2051 : 471 : return 0;
2052 : : }
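/*
 * Hedged summary of the cr ladder above as a standalone sketch
 * (illustrative only; the flexgroup, corruption, init-on-demand and
 * bb_largest_free_order checks are omitted for brevity).
 */
static int group_looks_good(unsigned int free, unsigned int frags,
			    unsigned int goal, int cr)
{
	if (free == 0 || frags == 0)
		return 0;
	switch (cr) {
	case 0:				/* hope for a clean 2^order hit */
	case 1:				/* avg free extent covers the goal */
		return free / frags >= goal;
	case 2:				/* total free space covers the goal */
		return free >= goal;
	default:			/* cr 3: anything non-empty will do */
		return 1;
	}
}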
2053 : :
2054 : : static noinline_for_stack int
2055 : 0 : ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2056 : : {
2057 : : ext4_group_t ngroups, group, i;
2058 : : int cr;
2059 : : int err = 0;
2060 : : struct ext4_sb_info *sbi;
2061 : 66204 : struct super_block *sb;
2062 : : struct ext4_buddy e4b;
2063 : :
2064 : 66204 : sb = ac->ac_sb;
2065 : : sbi = EXT4_SB(sb);
2066 : : ngroups = ext4_get_groups_count(sb);
2067 : : /* non-extent files are limited to low blocks/groups */
2068 [ - + ]: 66130 : if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2069 : 0 : ngroups = sbi->s_blockfile_groups;
2070 : :
2071 [ - + ]: 66130 : BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2072 : :
2073 : : /* first, try the goal */
2074 : 66130 : err = ext4_mb_find_by_goal(ac, &e4b);
2075 [ + ][ + + ]: 66184 : if (err || ac->ac_status == AC_STATUS_FOUND)
2076 : : goto out;
2077 : :
2078 [ + + ]: 55259 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2079 : : goto out;
2080 : :
2081 : : /*
2082 : :	 * ac->ac_2order is set only if the fe_len is a power of 2;
2083 : :	 * if ac_2order is set we also set the criteria to 0 so that we
2084 : : * try exact allocation using buddy.
2085 : : */
2086 : 44128 : i = fls(ac->ac_g_ex.fe_len);
2087 : 0 : ac->ac_2order = 0;
2088 : : /*
2089 : : * We search using buddy data only if the order of the request
2090 : :	 * is greater than or equal to sbi->s_mb_order2_reqs.
2091 : : * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2092 : : */
2093 [ + ]: 55166 : if (i >= sbi->s_mb_order2_reqs) {
2094 : : /*
2095 : : * This should tell if fe_len is exactly power of 2
2096 : : */
2097 [ + + ]: 15190 : if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2098 : 3709 : ac->ac_2order = i - 1;
2099 : : }
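/*
 * The bit test above clears the highest set bit reported by fls() and
 * checks that nothing remains, i.e. fe_len is an exact power of two.
 * A hedged equivalent in the more familiar form:
 */
static int is_power_of_two(unsigned int len)
{
	return len != 0 && (len & (len - 1)) == 0;
}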
2100 : :
2101 : : /* if stream allocation is enabled, use global goal */
2102 [ + ]: 0 : if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2103 : :		/* TBD: may be a hot spot */
2104 : : spin_lock(&sbi->s_md_lock);
2105 : 14663 : ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2106 : 14663 : ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2107 : : spin_unlock(&sbi->s_md_lock);
2108 : : }
2109 : :
2110 : :	/* Let's just scan groups to find more or less suitable blocks */
2111 : 55162 : cr = ac->ac_2order ? 0 : 1;
2112 : : /*
2113 : : * cr == 0 try to get exact allocation,
2114 : : * cr == 3 try to get anything
2115 : : */
2116 : : repeat:
2117 [ + ][ + + ]: 110488 : for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2118 : 55320 : ac->ac_criteria = cr;
2119 : : /*
2120 : : * searching for the right group start
2121 : : * from the goal value specified
2122 : : */
2123 : 55320 : group = ac->ac_g_ex.fe_group;
2124 : :
2125 [ + + ]: 175306 : for (i = 0; i < ngroups; group++, i++) {
2126 : 175226 : cond_resched();
2127 : : /*
2128 : : * Artificially restricted ngroups for non-extent
2129 : :			 * files makes group > ngroups possible on the first loop.
2130 : : */
2131 [ - + ]: 175286 : if (group >= ngroups)
2132 : : group = 0;
2133 : :
2134 : : /* This now checks without needing the buddy page */
2135 [ + + ]: 175286 : if (!ext4_mb_good_group(ac, group, cr))
2136 : 119906 : continue;
2137 : :
2138 : 55302 : err = ext4_mb_load_buddy(sb, group, &e4b);
2139 [ + ]: 55188 : if (err)
2140 : : goto out;
2141 : :
2142 : : ext4_lock_group(sb, group);
2143 : :
2144 : : /*
2145 : : * We need to check again after locking the
2146 : : * block group
2147 : : */
2148 [ - + ]: 55303 : if (!ext4_mb_good_group(ac, group, cr)) {
2149 : : ext4_unlock_group(sb, group);
2150 : 0 : ext4_mb_unload_buddy(&e4b);
2151 : 0 : continue;
2152 : : }
2153 : :
2154 : 55311 : ac->ac_groups_scanned++;
2155 [ + + ][ + - ]: 55311 : if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
2156 : 3709 : ext4_mb_simple_scan_group(ac, &e4b);
2157 [ + + ][ - + ]: 51602 : else if (cr == 1 && sbi->s_stripe &&
[ # # ]
2158 : 0 : !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2159 : 0 : ext4_mb_scan_aligned(ac, &e4b);
2160 : : else
2161 : 51602 : ext4_mb_complex_scan_group(ac, &e4b);
2162 : :
2163 : : ext4_unlock_group(sb, group);
2164 : 55330 : ext4_mb_unload_buddy(&e4b);
2165 : :
2166 [ + + ]: 55328 : if (ac->ac_status != AC_STATUS_CONTINUE)
2167 : : break;
2168 : : }
2169 : : }
2170 : :
2171 [ + ][ + + ]: 55168 : if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
[ + - ]
2172 : 1 : !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2173 : : /*
2174 : : * We've been searching too long. Let's try to allocate
2175 : : * the best chunk we've found so far
2176 : : */
2177 : :
2178 : 1 : ext4_mb_try_best_found(ac, &e4b);
2179 [ - + ]: 1 : if (ac->ac_status != AC_STATUS_FOUND) {
2180 : : /*
2181 : : * Someone more lucky has already allocated it.
2182 : : * The only thing we can do is just take first
2183 : : * found block(s)
2184 : : printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2185 : : */
2186 : 0 : ac->ac_b_ex.fe_group = 0;
2187 : 0 : ac->ac_b_ex.fe_start = 0;
2188 : 0 : ac->ac_b_ex.fe_len = 0;
2189 : 0 : ac->ac_status = AC_STATUS_CONTINUE;
2190 : 0 : ac->ac_flags |= EXT4_MB_HINT_FIRST;
2191 : : cr = 3;
2192 : 0 : atomic_inc(&sbi->s_mb_lost_chunks);
2193 : : goto repeat;
2194 : : }
2195 : : }
2196 : : out:
2197 : 66144 : return err;
2198 : : }
2199 : :
2200 : 0 : static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2201 : : {
2202 : 15 : struct super_block *sb = seq->private;
2203 : : ext4_group_t group;
2204 : :
2205 [ + - + + ]: 30 : if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2206 : : return NULL;
2207 : 13 : group = *pos + 1;
2208 : 13 : return (void *) ((unsigned long) group);
2209 : : }
2210 : :
2211 : 0 : static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2212 : : {
2213 : 103 : struct super_block *sb = seq->private;
2214 : : ext4_group_t group;
2215 : :
2216 : 103 : ++*pos;
2217 [ + - + - ]: 206 : if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2218 : : return NULL;
2219 : 103 : group = *pos + 1;
2220 : 103 : return (void *) ((unsigned long) group);
2221 : : }
2222 : :
2223 : 0 : static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2224 : : {
2225 : 116 : struct super_block *sb = seq->private;
2226 : 116 : ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2227 : : int i;
2228 : : int err, buddy_loaded = 0;
2229 : : struct ext4_buddy e4b;
2230 : : struct ext4_group_info *grinfo;
2231 : : struct sg {
2232 : : struct ext4_group_info info;
2233 : : ext4_grpblk_t counters[16];
2234 : : } sg;
2235 : :
2236 : 116 : group--;
2237 [ + + ]: 116 : if (group == 0)
2238 : 1 : seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2239 : : "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2240 : : "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2241 : : "group", "free", "frags", "first",
2242 : : "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2243 : : "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2244 : :
2245 : 116 : i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2246 : : sizeof(struct ext4_group_info);
2247 : : grinfo = ext4_get_group_info(sb, group);
2248 : : /* Load the group info in memory only if not already loaded. */
2249 [ + + ]: 116 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2250 : 62 : err = ext4_mb_load_buddy(sb, group, &e4b);
2251 [ - + ]: 62 : if (err) {
2252 : 0 : seq_printf(seq, "#%-5u: I/O error\n", group);
2253 : 0 : return 0;
2254 : : }
2255 : : buddy_loaded = 1;
2256 : : }
2257 : :
2258 : 116 : memcpy(&sg, ext4_get_group_info(sb, group), i);
2259 : :
2260 [ + + ]: 116 : if (buddy_loaded)
2261 : 62 : ext4_mb_unload_buddy(&e4b);
2262 : :
2263 : 116 : seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2264 : : sg.info.bb_fragments, sg.info.bb_first_free);
2265 [ + + ]: 1740 : for (i = 0; i <= 13; i++)
2266 [ + - ]: 1624 : seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2267 : : sg.info.bb_counters[i] : 0);
2268 : 116 : seq_printf(seq, " ]\n");
2269 : :
2270 : 116 : return 0;
2271 : : }
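/*
 * For reference: each row emitted by ext4_mb_seq_groups_show() above
 * describes one block group: free cluster count, fragment count,
 * first free cluster, and the bb_counters[] for orders 2^0..2^13
 * (orders beyond s_blocksize_bits + 1 print as 0). The file typically
 * appears as /proc/fs/ext4/<device>/mb_groups.
 */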
2272 : :
2273 : 0 : static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2274 : : {
2275 : 15 : }
2276 : :
2277 : : static const struct seq_operations ext4_mb_seq_groups_ops = {
2278 : : .start = ext4_mb_seq_groups_start,
2279 : : .next = ext4_mb_seq_groups_next,
2280 : : .stop = ext4_mb_seq_groups_stop,
2281 : : .show = ext4_mb_seq_groups_show,
2282 : : };
2283 : :
2284 : 0 : static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2285 : : {
2286 : 1 : struct super_block *sb = PDE_DATA(inode);
2287 : : int rc;
2288 : :
2289 : 1 : rc = seq_open(file, &ext4_mb_seq_groups_ops);
2290 [ + - ]: 1 : if (rc == 0) {
2291 : 1 : struct seq_file *m = file->private_data;
2292 : 1 : m->private = sb;
2293 : : }
2294 : 0 : return rc;
2295 : :
2296 : : }
2297 : :
2298 : : static const struct file_operations ext4_mb_seq_groups_fops = {
2299 : : .owner = THIS_MODULE,
2300 : : .open = ext4_mb_seq_groups_open,
2301 : : .read = seq_read,
2302 : : .llseek = seq_lseek,
2303 : : .release = seq_release,
2304 : : };
2305 : :
2306 : : static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2307 : : {
2308 : 0 : int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2309 : 0 : struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2310 : :
2311 [ # # ][ # # ]: 0 : BUG_ON(!cachep);
2312 : : return cachep;
2313 : : }
2314 : :
2315 : : /*
2316 : : * Allocate the top-level s_group_info array for the specified number
2317 : : * of groups
2318 : : */
2319 : 0 : int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2320 : : {
2321 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
2322 : : unsigned size;
2323 : : struct ext4_group_info ***new_groupinfo;
2324 : :
2325 : 0 : size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2326 : 0 : EXT4_DESC_PER_BLOCK_BITS(sb);
2327 [ # # ]: 0 : if (size <= sbi->s_group_info_size)
2328 : : return 0;
2329 : :
2330 [ # # ][ # # ]: 0 : size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2331 : 0 : new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2332 [ # # ]: 0 : if (!new_groupinfo) {
2333 : 0 : ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2334 : 0 : return -ENOMEM;
2335 : : }
2336 [ # # ]: 0 : if (sbi->s_group_info) {
2337 : 0 : memcpy(new_groupinfo, sbi->s_group_info,
2338 : 0 : sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2339 : 0 : ext4_kvfree(sbi->s_group_info);
2340 : : }
2341 : 0 : sbi->s_group_info = new_groupinfo;
2342 : 0 : sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2343 : : ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
2344 : : sbi->s_group_info_size);
2345 : 0 : return 0;
2346 : : }
2347 : :
2348 : : /* Create and initialize ext4_group_info data for the given group. */
2349 : 0 : int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2350 : : struct ext4_group_desc *desc)
2351 : : {
2352 : : int i;
2353 : : int metalen = 0;
2354 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
2355 : : struct ext4_group_info **meta_group_info;
2356 : 0 : struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2357 : :
2358 : : /*
2359 : : * First check if this group is the first of a reserved block.
2360 : : * If it's true, we have to allocate a new table of pointers
2361 : : * to ext4_group_info structures
2362 : : */
2363 [ # # ]: 0 : if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2364 : 0 : metalen = sizeof(*meta_group_info) <<
2365 : 0 : EXT4_DESC_PER_BLOCK_BITS(sb);
2366 : : meta_group_info = kmalloc(metalen, GFP_KERNEL);
2367 [ # # ]: 0 : if (meta_group_info == NULL) {
2368 : 0 : ext4_msg(sb, KERN_ERR, "can't allocate mem "
2369 : : "for a buddy group");
2370 : 0 : goto exit_meta_group_info;
2371 : : }
2372 : 0 : sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2373 : : meta_group_info;
2374 : : }
2375 : :
2376 : 0 : meta_group_info =
2377 : 0 : sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2378 : 0 : i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2379 : :
2380 : 0 : meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
2381 [ # # ]: 0 : if (meta_group_info[i] == NULL) {
2382 : 0 : ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2383 : : goto exit_group_info;
2384 : : }
2385 : 0 : set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2386 : : &(meta_group_info[i]->bb_state));
2387 : :
2388 : : /*
2389 : : * initialize bb_free to be able to skip
2390 : : * empty groups without initialization
2391 : : */
2392 [ # # ]: 0 : if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2393 : 0 : meta_group_info[i]->bb_free =
2394 : 0 : ext4_free_clusters_after_init(sb, group, desc);
2395 : : } else {
2396 : 0 : meta_group_info[i]->bb_free =
2397 : 0 : ext4_free_group_clusters(sb, desc);
2398 : : }
2399 : :
2400 : 0 : INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2401 : 0 : init_rwsem(&meta_group_info[i]->alloc_sem);
2402 : 0 : meta_group_info[i]->bb_free_root = RB_ROOT;
2403 : 0 : meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2404 : :
2405 : : #ifdef DOUBLE_CHECK
2406 : : {
2407 : : struct buffer_head *bh;
2408 : : meta_group_info[i]->bb_bitmap =
2409 : : kmalloc(sb->s_blocksize, GFP_KERNEL);
2410 : : BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2411 : : bh = ext4_read_block_bitmap(sb, group);
2412 : : BUG_ON(bh == NULL);
2413 : : memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2414 : : sb->s_blocksize);
2415 : : put_bh(bh);
2416 : : }
2417 : : #endif
2418 : :
2419 : 0 : return 0;
2420 : :
2421 : : exit_group_info:
2422 : : /* If a meta_group_info table has been allocated, release it now */
2423 [ # # ]: 0 : if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2424 : 0 : kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2425 : 0 : sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2426 : : }
2427 : : exit_meta_group_info:
2428 : : return -ENOMEM;
2429 : : } /* ext4_mb_add_groupinfo */
2430 : :
2431 : 0 : static int ext4_mb_init_backend(struct super_block *sb)
2432 : : {
2433 : : ext4_group_t ngroups = ext4_get_groups_count(sb);
2434 : : ext4_group_t i;
2435 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
2436 : : int err;
2437 : : struct ext4_group_desc *desc;
2438 : : struct kmem_cache *cachep;
2439 : :
2440 : 0 : err = ext4_mb_alloc_groupinfo(sb, ngroups);
2441 [ # # ]: 0 : if (err)
2442 : : return err;
2443 : :
2444 : 0 : sbi->s_buddy_cache = new_inode(sb);
2445 [ # # ]: 0 : if (sbi->s_buddy_cache == NULL) {
2446 : 0 : ext4_msg(sb, KERN_ERR, "can't get new inode");
2447 : 0 : goto err_freesgi;
2448 : : }
2449 : :	/* To avoid potentially colliding with a valid on-disk inode number,
2450 : : * use EXT4_BAD_INO for the buddy cache inode number. This inode is
2451 : : * not in the inode hash, so it should never be found by iget(), but
2452 : : * this will avoid confusion if it ever shows up during debugging. */
2453 : 0 : sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2454 : 0 : EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2455 [ # # ]: 0 : for (i = 0; i < ngroups; i++) {
2456 : 0 : desc = ext4_get_group_desc(sb, i, NULL);
2457 [ # # ]: 0 : if (desc == NULL) {
2458 : 0 : ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2459 : 0 : goto err_freebuddy;
2460 : : }
2461 [ # # ]: 0 : if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2462 : : goto err_freebuddy;
2463 : : }
2464 : :
2465 : : return 0;
2466 : :
2467 : : err_freebuddy:
2468 : 0 : cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2469 [ # # ]: 0 : while (i-- > 0)
2470 : 0 : kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2471 : 0 : i = sbi->s_group_info_size;
2472 [ # # ]: 0 : while (i-- > 0)
2473 : 0 : kfree(sbi->s_group_info[i]);
2474 : 0 : iput(sbi->s_buddy_cache);
2475 : : err_freesgi:
2476 : 0 : ext4_kvfree(sbi->s_group_info);
2477 : 0 : return -ENOMEM;
2478 : : }
2479 : :
2480 : 0 : static void ext4_groupinfo_destroy_slabs(void)
2481 : : {
2482 : : int i;
2483 : :
2484 [ # # ]: 0 : for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2485 [ # # ]: 0 : if (ext4_groupinfo_caches[i])
2486 : 0 : kmem_cache_destroy(ext4_groupinfo_caches[i]);
2487 : 0 : ext4_groupinfo_caches[i] = NULL;
2488 : : }
2489 : 0 : }
2490 : :
2491 : 0 : static int ext4_groupinfo_create_slab(size_t size)
2492 : : {
2493 : : static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2494 : : int slab_size;
2495 [ # # ][ # # ]: 0 : int blocksize_bits = order_base_2(size);
2496 : 0 : int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2497 : : struct kmem_cache *cachep;
2498 : :
2499 [ # # ]: 0 : if (cache_index >= NR_GRPINFO_CACHES)
2500 : : return -EINVAL;
2501 : :
2502 [ # # ]: 0 : if (unlikely(cache_index < 0))
2503 : : cache_index = 0;
2504 : :
2505 : 0 : mutex_lock(&ext4_grpinfo_slab_create_mutex);
2506 [ # # ]: 0 : if (ext4_groupinfo_caches[cache_index]) {
2507 : 0 : mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2508 : 0 : return 0; /* Already created */
2509 : : }
2510 : :
2511 : 0 : slab_size = offsetof(struct ext4_group_info,
2512 : : bb_counters[blocksize_bits + 2]);
2513 : :
2514 : 0 : cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2515 : : slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2516 : : NULL);
2517 : :
2518 : 0 : ext4_groupinfo_caches[cache_index] = cachep;
2519 : :
2520 : 0 : mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2521 [ # # ]: 0 : if (!cachep) {
2522 : 0 : printk(KERN_EMERG
2523 : : "EXT4-fs: no memory for groupinfo slab cache\n");
2524 : 0 : return -ENOMEM;
2525 : : }
2526 : :
2527 : : return 0;
2528 : : }
2529 : :
2530 : 0 : int ext4_mb_init(struct super_block *sb)
2531 : : {
2532 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
2533 : : unsigned i, j;
2534 : : unsigned offset;
2535 : : unsigned max;
2536 : : int ret;
2537 : :
2538 : 0 : i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2539 : :
2540 : 0 : sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2541 [ # # ]: 0 : if (sbi->s_mb_offsets == NULL) {
2542 : : ret = -ENOMEM;
2543 : : goto out;
2544 : : }
2545 : :
2546 : 0 : i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2547 : 0 : sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2548 [ # # ]: 0 : if (sbi->s_mb_maxs == NULL) {
2549 : : ret = -ENOMEM;
2550 : : goto out;
2551 : : }
2552 : :
2553 : 0 : ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2554 [ # # ]: 0 : if (ret < 0)
2555 : : goto out;
2556 : :
2557 : : /* order 0 is regular bitmap */
2558 : 0 : sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2559 : 0 : sbi->s_mb_offsets[0] = 0;
2560 : :
2561 : : i = 1;
2562 : : offset = 0;
2563 : 0 : max = sb->s_blocksize << 2;
2564 : : do {
2565 : 0 : sbi->s_mb_offsets[i] = offset;
2566 : 0 : sbi->s_mb_maxs[i] = max;
2567 : 0 : offset += 1 << (sb->s_blocksize_bits - i);
2568 : 0 : max = max >> 1;
2569 : 0 : i++;
2570 [ # # ]: 0 : } while (i <= sb->s_blocksize_bits + 1);
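/*
 * Hedged userspace replay of the loop above for a 4KiB block size
 * (blocksize_bits == 12): s_mb_offsets[] appear to be byte offsets of
 * each order's bitmap inside the buddy block, s_mb_maxs[] the number
 * of bits at that order. Illustrative only.
 */
#include <stdio.h>

int main(void)
{
	unsigned int bits = 12, i = 1, offset = 0;
	unsigned int max = (1u << bits) << 2;	/* bits at order 1 */

	printf("order 0: offset 0, max %u (the bitmap itself)\n",
	       (1u << bits) << 3);
	do {
		printf("order %u: offset %u, max %u\n", i, offset, max);
		offset += 1u << (bits - i);
		max >>= 1;
		i++;
	} while (i <= bits + 1);
	return 0;
}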
2571 : :
2572 : 0 : spin_lock_init(&sbi->s_md_lock);
2573 : 0 : spin_lock_init(&sbi->s_bal_lock);
2574 : :
2575 : 0 : sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2576 : 0 : sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2577 : 0 : sbi->s_mb_stats = MB_DEFAULT_STATS;
2578 : 0 : sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2579 : 0 : sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2580 : : /*
2581 : : * The default group preallocation is 512, which for 4k block
2582 : : * sizes translates to 2 megabytes. However for bigalloc file
2583 : :	 * systems, this is probably too big (i.e., if the cluster size
2584 : :	 * is 1 megabyte, then the group preallocation size becomes half a
2585 : :	 * gigabyte!). As a default, we will keep a two megabyte
2586 : :	 * group prealloc size for cluster sizes up to 64k, and after
2587 : : * that, we will force a minimum group preallocation size of
2588 : : * 32 clusters. This translates to 8 megs when the cluster
2589 : : * size is 256k, and 32 megs when the cluster size is 1 meg,
2590 : : * which seems reasonable as a default.
2591 : : */
2592 : 0 : sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2593 : : sbi->s_cluster_bits, 32);
2594 : : /*
2595 : : * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2596 : : * to the lowest multiple of s_stripe which is bigger than
2597 : : * the s_mb_group_prealloc as determined above. We want
2598 : : * the preallocation size to be an exact multiple of the
2599 : : * RAID stripe size so that preallocations don't fragment
2600 : : * the stripes.
2601 : : */
2602 [ # # ]: 0 : if (sbi->s_stripe > 1) {
2603 : 0 : sbi->s_mb_group_prealloc = roundup(
2604 : : sbi->s_mb_group_prealloc, sbi->s_stripe);
2605 : : }
2606 : :
2607 : 0 : sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2608 [ # # ]: 0 : if (sbi->s_locality_groups == NULL) {
2609 : : ret = -ENOMEM;
2610 : : goto out_free_groupinfo_slab;
2611 : : }
2612 [ # # ]: 0 : for_each_possible_cpu(i) {
2613 : : struct ext4_locality_group *lg;
2614 : 0 : lg = per_cpu_ptr(sbi->s_locality_groups, i);
2615 : 0 : mutex_init(&lg->lg_mutex);
2616 [ # # ]: 0 : for (j = 0; j < PREALLOC_TB_SIZE; j++)
2617 : 0 : INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2618 : 0 : spin_lock_init(&lg->lg_prealloc_lock);
2619 : : }
2620 : :
2621 : : /* init file for buddy data */
2622 : 0 : ret = ext4_mb_init_backend(sb);
2623 [ # # ]: 0 : if (ret != 0)
2624 : : goto out_free_locality_groups;
2625 : :
2626 [ # # ]: 0 : if (sbi->s_proc)
2627 : 0 : proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2628 : : &ext4_mb_seq_groups_fops, sb);
2629 : :
2630 : : return 0;
2631 : :
2632 : : out_free_locality_groups:
2633 : 0 : free_percpu(sbi->s_locality_groups);
2634 : 0 : sbi->s_locality_groups = NULL;
2635 : : out_free_groupinfo_slab:
2636 : 0 : ext4_groupinfo_destroy_slabs();
2637 : : out:
2638 : 0 : kfree(sbi->s_mb_offsets);
2639 : 0 : sbi->s_mb_offsets = NULL;
2640 : 0 : kfree(sbi->s_mb_maxs);
2641 : 0 : sbi->s_mb_maxs = NULL;
2642 : 0 : return ret;
2643 : : }
2644 : :
2645 : : /* needs to be called with the ext4 group lock held */
2646 : 0 : static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2647 : : {
2648 : : struct ext4_prealloc_space *pa;
2649 : : struct list_head *cur, *tmp;
2650 : : int count = 0;
2651 : :
2652 [ # # ]: 0 : list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2653 : 0 : pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2654 : : list_del(&pa->pa_group_list);
2655 : : count++;
2656 : 0 : kmem_cache_free(ext4_pspace_cachep, pa);
2657 : : }
2658 : : if (count)
2659 : : mb_debug(1, "mballoc: %u PAs left\n", count);
2660 : :
2661 : 0 : }
2662 : :
2663 : 0 : int ext4_mb_release(struct super_block *sb)
2664 : : {
2665 : : ext4_group_t ngroups = ext4_get_groups_count(sb);
2666 : : ext4_group_t i;
2667 : : int num_meta_group_infos;
2668 : : struct ext4_group_info *grinfo;
2669 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
2670 : 0 : struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2671 : :
2672 [ # # ]: 0 : if (sbi->s_proc)
2673 : 0 : remove_proc_entry("mb_groups", sbi->s_proc);
2674 : :
2675 [ # # ]: 0 : if (sbi->s_group_info) {
2676 [ # # ]: 0 : for (i = 0; i < ngroups; i++) {
2677 : : grinfo = ext4_get_group_info(sb, i);
2678 : : #ifdef DOUBLE_CHECK
2679 : : kfree(grinfo->bb_bitmap);
2680 : : #endif
2681 : : ext4_lock_group(sb, i);
2682 : 0 : ext4_mb_cleanup_pa(grinfo);
2683 : : ext4_unlock_group(sb, i);
2684 : 0 : kmem_cache_free(cachep, grinfo);
2685 : : }
2686 : 0 : num_meta_group_infos = (ngroups +
2687 : 0 : EXT4_DESC_PER_BLOCK(sb) - 1) >>
2688 : 0 : EXT4_DESC_PER_BLOCK_BITS(sb);
2689 [ # # ]: 0 : for (i = 0; i < num_meta_group_infos; i++)
2690 : 0 : kfree(sbi->s_group_info[i]);
2691 : 0 : ext4_kvfree(sbi->s_group_info);
2692 : : }
2693 : 0 : kfree(sbi->s_mb_offsets);
2694 : 0 : kfree(sbi->s_mb_maxs);
2695 [ # # ]: 0 : if (sbi->s_buddy_cache)
2696 : 0 : iput(sbi->s_buddy_cache);
2697 [ # # ]: 0 : if (sbi->s_mb_stats) {
2698 : 0 : ext4_msg(sb, KERN_INFO,
2699 : : "mballoc: %u blocks %u reqs (%u success)",
2700 : : atomic_read(&sbi->s_bal_allocated),
2701 : : atomic_read(&sbi->s_bal_reqs),
2702 : : atomic_read(&sbi->s_bal_success));
2703 : 0 : ext4_msg(sb, KERN_INFO,
2704 : : "mballoc: %u extents scanned, %u goal hits, "
2705 : : "%u 2^N hits, %u breaks, %u lost",
2706 : : atomic_read(&sbi->s_bal_ex_scanned),
2707 : : atomic_read(&sbi->s_bal_goals),
2708 : : atomic_read(&sbi->s_bal_2orders),
2709 : : atomic_read(&sbi->s_bal_breaks),
2710 : : atomic_read(&sbi->s_mb_lost_chunks));
2711 : 0 : ext4_msg(sb, KERN_INFO,
2712 : : "mballoc: %lu generated and it took %Lu",
2713 : : sbi->s_mb_buddies_generated,
2714 : : sbi->s_mb_generation_time);
2715 : 0 : ext4_msg(sb, KERN_INFO,
2716 : : "mballoc: %u preallocated, %u discarded",
2717 : : atomic_read(&sbi->s_mb_preallocated),
2718 : : atomic_read(&sbi->s_mb_discarded));
2719 : : }
2720 : :
2721 : 0 : free_percpu(sbi->s_locality_groups);
2722 : :
2723 : 0 : return 0;
2724 : : }
2725 : :
2726 : 0 : static inline int ext4_issue_discard(struct super_block *sb,
2727 : : ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2728 : : {
2729 : : ext4_fsblk_t discard_block;
2730 : :
2731 : 0 : discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2732 : : ext4_group_first_block_no(sb, block_group));
2733 : 0 : count = EXT4_C2B(EXT4_SB(sb), count);
2734 : 0 : trace_ext4_discard_blocks(sb,
2735 : : (unsigned long long) discard_block, count);
2736 : : return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2737 : : }
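/*
 * Hedged sketch of the address computed above, assuming EXT4_C2B()
 * scales clusters to blocks by the cluster-ratio shift (illustrative,
 * not the kernel macro).
 */
#include <stdint.h>

static uint64_t cluster_to_fsblk(uint64_t group_first_block,
				 uint32_t cluster, uint32_t cluster_bits)
{
	return group_first_block + ((uint64_t)cluster << cluster_bits);
}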
2738 : :
2739 : : /*
2740 : : * This function is called by the jbd2 layer once the commit has finished,
2741 : : * so we know we can free the blocks that were released with that commit.
2742 : : */
2743 : 0 : static void ext4_free_data_callback(struct super_block *sb,
2744 : : struct ext4_journal_cb_entry *jce,
2745 : : int rc)
2746 : : {
2747 : : struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2748 : : struct ext4_buddy e4b;
2749 : : struct ext4_group_info *db;
2750 : : int err, count = 0, count2 = 0;
2751 : :
2752 : : mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2753 : : entry->efd_count, entry->efd_group, entry);
2754 : :
2755 [ - + ]: 68103 : if (test_opt(sb, DISCARD)) {
2756 : 0 : err = ext4_issue_discard(sb, entry->efd_group,
2757 : : entry->efd_start_cluster,
2758 : : entry->efd_count);
2759 [ # # ]: 0 : if (err && err != -EOPNOTSUPP)
2760 : 0 : ext4_msg(sb, KERN_WARNING, "discard request in"
2761 : : " group:%d block:%d count:%d failed"
2762 : : " with %d", entry->efd_group,
2763 : : entry->efd_start_cluster,
2764 : : entry->efd_count, err);
2765 : : }
2766 : :
2767 : 68103 : err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2768 : : /* we expect to find existing buddy because it's pinned */
2769 [ - + ]: 68103 : BUG_ON(err != 0);
2770 : :
2771 : :
2772 : 68103 : db = e4b.bd_info;
2773 : : /* there are blocks to put in buddy to make them really free */
2774 : : count += entry->efd_count;
2775 : : count2++;
2776 : 68103 : ext4_lock_group(sb, entry->efd_group);
2777 : : /* Take it out of per group rb tree */
2778 : 68103 : rb_erase(&entry->efd_node, &(db->bb_free_root));
2779 : 68103 : mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2780 : :
2781 : : /*
2782 : : * Clear the trimmed flag for the group so that the next
2783 : : * ext4_trim_fs can trim it.
2784 : : * If the volume is mounted with -o discard, online discard
2785 : : * is supported and the free blocks will be trimmed online.
2786 : : */
2787 [ + - ]: 68103 : if (!test_opt(sb, DISCARD))
2788 : 68103 : EXT4_MB_GRP_CLEAR_TRIMMED(db);
2789 : :
2790 [ + + ]: 68103 : if (!db->bb_free_root.rb_node) {
2791 : :		/* No more items in the per-group rb tree;
2792 : :		 * balance refcounts from ext4_mb_free_metadata()
2793 : : */
2794 : 3261 : page_cache_release(e4b.bd_buddy_page);
2795 : 3261 : page_cache_release(e4b.bd_bitmap_page);
2796 : : }
2797 : 68103 : ext4_unlock_group(sb, entry->efd_group);
2798 : 68103 : kmem_cache_free(ext4_free_data_cachep, entry);
2799 : 68103 : ext4_mb_unload_buddy(&e4b);
2800 : :
2801 : : mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2802 : 68103 : }
2803 : :
2804 : 0 : int __init ext4_init_mballoc(void)
2805 : : {
2806 : 0 : ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2807 : : SLAB_RECLAIM_ACCOUNT);
2808 [ # # ]: 0 : if (ext4_pspace_cachep == NULL)
2809 : : return -ENOMEM;
2810 : :
2811 : 0 : ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2812 : : SLAB_RECLAIM_ACCOUNT);
2813 [ # # ]: 0 : if (ext4_ac_cachep == NULL) {
2814 : 0 : kmem_cache_destroy(ext4_pspace_cachep);
2815 : 0 : return -ENOMEM;
2816 : : }
2817 : :
2818 : 0 : ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2819 : : SLAB_RECLAIM_ACCOUNT);
2820 [ # # ]: 0 : if (ext4_free_data_cachep == NULL) {
2821 : 0 : kmem_cache_destroy(ext4_pspace_cachep);
2822 : 0 : kmem_cache_destroy(ext4_ac_cachep);
2823 : 0 : return -ENOMEM;
2824 : : }
2825 : : return 0;
2826 : : }
2827 : :
2828 : 0 : void ext4_exit_mballoc(void)
2829 : : {
2830 : : /*
2831 : : * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2832 : : * before destroying the slab cache.
2833 : : */
2834 : 0 : rcu_barrier();
2835 : 0 : kmem_cache_destroy(ext4_pspace_cachep);
2836 : 0 : kmem_cache_destroy(ext4_ac_cachep);
2837 : 0 : kmem_cache_destroy(ext4_free_data_cachep);
2838 : 0 : ext4_groupinfo_destroy_slabs();
2839 : 0 : }
2840 : :
2841 : :
2842 : : /*
2843 : : * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2844 : :  * Returns 0 on success or an error code
2845 : : */
2846 : : static noinline_for_stack int
2847 : 0 : ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2848 : : handle_t *handle, unsigned int reserv_clstrs)
2849 : : {
2850 : : struct buffer_head *bitmap_bh = NULL;
2851 : : struct ext4_group_desc *gdp;
2852 : : struct buffer_head *gdp_bh;
2853 : : struct ext4_sb_info *sbi;
2854 : 194568 : struct super_block *sb;
2855 : : ext4_fsblk_t block;
2856 : : int err, len;
2857 : :
2858 [ - + ]: 194568 : BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2859 [ - + ]: 194568 : BUG_ON(ac->ac_b_ex.fe_len <= 0);
2860 : :
2861 : 194568 : sb = ac->ac_sb;
2862 : : sbi = EXT4_SB(sb);
2863 : :
2864 : : err = -EIO;
2865 : 194568 : bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2866 [ + - ]: 194571 : if (!bitmap_bh)
2867 : : goto out_err;
2868 : :
2869 : 194571 : err = ext4_journal_get_write_access(handle, bitmap_bh);
2870 [ + ]: 194544 : if (err)
2871 : : goto out_err;
2872 : :
2873 : : err = -EIO;
2874 : 194545 : gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2875 [ + + ]: 194546 : if (!gdp)
2876 : : goto out_err;
2877 : :
2878 : 194543 : ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2879 : : ext4_free_group_clusters(sb, gdp));
2880 : :
2881 : 194555 : err = ext4_journal_get_write_access(handle, gdp_bh);
2882 [ + - ]: 194587 : if (err)
2883 : : goto out_err;
2884 : :
2885 : 194587 : block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2886 : :
2887 : 194587 : len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2888 [ - + ]: 194587 : if (!ext4_data_block_valid(sbi, block, len)) {
2889 : 0 : ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2890 : : "fs metadata", block, block+len);
2891 : :		/* The file system is mounted not to panic on error:
2892 : :		 * fix the bitmap and repeat the block allocation.
2893 : :		 * We leak some of the blocks here.
2894 : : */
2895 : 0 : ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2896 : 0 : ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2897 : : ac->ac_b_ex.fe_len);
2898 : 0 : ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2899 : 0 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2900 [ # # ]: 0 : if (!err)
2901 : : err = -EAGAIN;
2902 : : goto out_err;
2903 : : }
2904 : :
2905 : 194571 : ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2906 : : #ifdef AGGRESSIVE_CHECK
2907 : : {
2908 : : int i;
2909 : : for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2910 : : BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2911 : : bitmap_bh->b_data));
2912 : : }
2913 : : }
2914 : : #endif
2915 : 194588 : ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2916 : : ac->ac_b_ex.fe_len);
2917 [ + + ]: 194587 : if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2918 : 4 : gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2919 : 4 : ext4_free_group_clusters_set(sb, gdp,
2920 : : ext4_free_clusters_after_init(sb,
2921 : : ac->ac_b_ex.fe_group, gdp));
2922 : : }
2923 : 194587 : len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2924 : 194590 : ext4_free_group_clusters_set(sb, gdp, len);
2925 : 194584 : ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
2926 : 194585 : ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
2927 : :
2928 : 194588 : ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2929 : 194587 : percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2930 : : /*
2931 : : * Now reduce the dirty block count also. Should not go negative
2932 : : */
2933 [ + + ]: 194586 : if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2934 : : /* release all the reserved blocks if non delalloc */
2935 : 108077 : percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2936 : : reserv_clstrs);
2937 : :
2938 [ + - ]: 194582 : if (sbi->s_log_groups_per_flex) {
2939 : 194582 : ext4_group_t flex_group = ext4_flex_group(sbi,
2940 : : ac->ac_b_ex.fe_group);
2941 : 389164 : atomic64_sub(ac->ac_b_ex.fe_len,
2942 : 194582 : &sbi->s_flex_groups[flex_group].free_clusters);
2943 : : }
2944 : :
2945 : 194583 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2946 [ + ]: 194574 : if (err)
2947 : : goto out_err;
2948 : 194576 : err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2949 : :
2950 : : out_err:
2951 : : brelse(bitmap_bh);
2952 : 194587 : return err;
2953 : : }
2954 : :
2955 : : /*
2956 : :  * Here we normalize the request for a locality group.
2957 : :  * Group requests are normalized to s_mb_group_prealloc, which goes to
2958 : :  * s_stripe if we set the same via the mount option.
2959 : : * s_mb_group_prealloc can be configured via
2960 : : * /sys/fs/ext4/<partition>/mb_group_prealloc
2961 : : *
2962 : : * XXX: should we try to preallocate more than the group has now?
2963 : : */
2964 : 0 : static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2965 : : {
2966 : 24 : struct super_block *sb = ac->ac_sb;
2967 : 12 : struct ext4_locality_group *lg = ac->ac_lg;
2968 : :
2969 [ - + ]: 12 : BUG_ON(lg == NULL);
2970 : 12 : ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2971 : : mb_debug(1, "#%u: goal %u blocks for locality group\n",
2972 : : current->pid, ac->ac_g_ex.fe_len);
2973 : 12 : }
2974 : :
2975 : : /*
2976 : :  * Normalization means making the request better in terms of
2977 : : * size and alignment
2978 : : */
2979 : : static noinline_for_stack void
2980 : 0 : ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2981 : : struct ext4_allocation_request *ar)
2982 : : {
2983 : 336033 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2984 : : int bsbits, max;
2985 : : ext4_lblk_t end;
2986 : : loff_t size, start_off;
2987 : : loff_t orig_size __maybe_unused;
2988 : : ext4_lblk_t start;
2989 : 66226 : struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2990 : : struct ext4_prealloc_space *pa;
2991 : :
2992 : :	/* only normalize data requests; metadata requests
2993 : :	   do not need preallocation */
2994 [ + + ]: 66226 : if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2995 : : return;
2996 : :
2997 : :	/* sometimes the caller may want exact blocks */
2998 [ + ]: 27648 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2999 : : return;
3000 : :
3001 : : /* caller may indicate that preallocation isn't
3002 : : * required (it's a tail, for example) */
3003 [ + + ]: 27649 : if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3004 : : return;
3005 : :
3006 [ + + ]: 25663 : if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3007 : 12 : ext4_mb_normalize_group_request(ac);
3008 : 12 : return ;
3009 : : }
3010 : :
3011 : 25651 : bsbits = ac->ac_sb->s_blocksize_bits;
3012 : :
3013 : :	/* first, let's learn the actual file size
3014 : :	 * once the current request is allocated */
3015 : 25651 : size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3016 : 25654 : size = size << bsbits;
3017 [ + + ]: 25650 : if (size < i_size_read(ac->ac_inode))
3018 : 24902 : size = i_size_read(ac->ac_inode);
3019 : : orig_size = size;
3020 : :
3021 : : /* max size of free chunks */
3022 : 25651 : max = 2 << bsbits;
3023 : :
3024 : : #define NRL_CHECK_SIZE(req, size, max, chunk_size) \
3025 : : (req <= (size) || max <= (chunk_size))
3026 : :
3027 : : /* first, try to predict filesize */
3028 : : /* XXX: should this table be tunable? */
3029 : : start_off = 0;
3030 [ + ]: 91877 : if (size <= 16 * 1024) {
3031 : : size = 16 * 1024;
3032 [ + ]: 25652 : } else if (size <= 32 * 1024) {
3033 : : size = 32 * 1024;
3034 [ + + ]: 25653 : } else if (size <= 64 * 1024) {
3035 : : size = 64 * 1024;
3036 [ + + ]: 25651 : } else if (size <= 128 * 1024) {
3037 : : size = 128 * 1024;
3038 [ + + ]: 25602 : } else if (size <= 256 * 1024) {
3039 : : size = 256 * 1024;
3040 [ + + ]: 25526 : } else if (size <= 512 * 1024) {
3041 : : size = 512 * 1024;
3042 [ + + ]: 25226 : } else if (size <= 1024 * 1024) {
3043 : : size = 1024 * 1024;
3044 [ + + ]: 756 : } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3045 : 150 : start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3046 : 150 : (21 - bsbits)) << 21;
3047 : : size = 2 * 1024 * 1024;
3048 [ + + ]: 606 : } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3049 : 26 : start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3050 : 26 : (22 - bsbits)) << 22;
3051 : : size = 4 * 1024 * 1024;
3052 [ - + ][ # # ]: 580 : } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3053 : : (8<<20)>>bsbits, max, 8 * 1024)) {
3054 : 1160 : start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3055 : 580 : (23 - bsbits)) << 23;
3056 : 580 : size = 8 * 1024 * 1024;
3057 : : } else {
3058 : 0 : start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
3059 : 0 : size = ac->ac_o_ex.fe_len << bsbits;
3060 : : }
3061 : 25651 : size = size >> bsbits;
3062 : 25651 : start = start_off >> bsbits;
3063 : :
3064 : : /* don't cover already allocated blocks in selected range */
3065 [ + + ][ + + ]: 25651 : if (ar->pleft && start <= ar->lleft) {
3066 : 24278 : size -= ar->lleft + 1 - start;
3067 : 24278 : start = ar->lleft + 1;
3068 : : }
3069 [ + + ][ + + ]: 25651 : if (ar->pright && start + size - 1 >= ar->lright)
3070 : 21271 : size -= start + size - ar->lright;
3071 : :
3072 : 25651 : end = start + size;
3073 : :
3074 : : /* check we don't cross already preallocated blocks */
3075 : : rcu_read_lock();
3076 [ + + ]: 147719 : list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3077 : : ext4_lblk_t pa_end;
3078 : :
3079 [ - + ]: 122066 : if (pa->pa_deleted)
3080 : 0 : continue;
3081 : : spin_lock(&pa->pa_lock);
3082 [ - + ]: 122075 : if (pa->pa_deleted) {
3083 : : spin_unlock(&pa->pa_lock);
3084 : 0 : continue;
3085 : : }
3086 : :
3087 : 244150 : pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3088 : : pa->pa_len);
3089 : :
3090 : : /* PA must not overlap original request */
3091 [ + + ][ - + ]: 122075 : BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3092 : : ac->ac_o_ex.fe_logical < pa->pa_lstart));
3093 : :
3094 : : /* skip PAs this normalized request doesn't overlap with */
3095 [ + + ][ + + ]: 122075 : if (pa->pa_lstart >= end || pa_end <= start) {
3096 : : spin_unlock(&pa->pa_lock);
3097 : 122043 : continue;
3098 : : }
3099 [ + - ][ - + ]: 35 : BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3100 : :
3101 : : /* adjust start or end to be adjacent to this pa */
3102 [ + - ]: 35 : if (pa_end <= ac->ac_o_ex.fe_logical) {
3103 [ - + ]: 35 : BUG_ON(pa_end < start);
3104 : : start = pa_end;
3105 [ # # ]: 0 : } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3106 [ # # ]: 0 : BUG_ON(pa->pa_lstart > end);
3107 : : end = pa->pa_lstart;
3108 : : }
3109 : : spin_unlock(&pa->pa_lock);
3110 : : }
3111 : : rcu_read_unlock();
3112 : 25653 : size = end - start;
3113 : :
3114 : : /* XXX: extra loop to check we really don't overlap preallocations */
3115 : : rcu_read_lock();
3116 [ + + ]: 147729 : list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3117 : : ext4_lblk_t pa_end;
3118 : :
3119 : : spin_lock(&pa->pa_lock);
3120 [ + - ]: 122079 : if (pa->pa_deleted == 0) {
3121 : 244158 : pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3122 : : pa->pa_len);
3123 [ + + ][ - + ]: 122079 : BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3124 : : }
3125 : : spin_unlock(&pa->pa_lock);
3126 : : }
3127 : : rcu_read_unlock();
3128 : :
3129 [ - + ][ # # ]: 25653 : if (start + size <= ac->ac_o_ex.fe_logical &&
3130 : : start > ac->ac_o_ex.fe_logical) {
3131 : 0 : ext4_msg(ac->ac_sb, KERN_ERR,
3132 : : "start %lu, size %lu, fe_logical %lu",
3133 : : (unsigned long) start, (unsigned long) size,
3134 : : (unsigned long) ac->ac_o_ex.fe_logical);
3135 : : }
3136 [ - + ][ # # ]: 25653 : BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3137 : : start > ac->ac_o_ex.fe_logical);
3138 [ + - ][ + + ]: 25653 : BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
3139 : :
3140 : : /* now prepare goal request */
3141 : :
3142 : : /* XXX: is it better to align blocks with respect to logical
3143 : : * placement, or to satisfy a big request as is? */
3144 : 25652 : ac->ac_g_ex.fe_logical = start;
3145 : 25652 : ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3146 : :
3147 : : /* define goal start in order to merge */
3148 [ + + ][ + + ]: 25652 : if (ar->pright && (ar->lright == (start + size))) {
3149 : : /* merge to the right */
3150 : 21273 : ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3151 : : &ac->ac_f_ex.fe_group,
3152 : : &ac->ac_f_ex.fe_start);
3153 : 21273 : ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3154 : : }
3155 [ + + ][ + + ]: 25652 : if (ar->pleft && (ar->lleft + 1 == start)) {
3156 : : /* merge to the left */
3157 : 24578 : ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3158 : : &ac->ac_f_ex.fe_group,
3159 : : &ac->ac_f_ex.fe_start);
3160 : 24577 : ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3161 : : }
3162 : :
3163 : : mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3164 : : (unsigned) orig_size, (unsigned) start);
3165 : : }
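
The bucket table above rounds the predicted file size up to the next preallocation size class (16 KiB through 8 MiB). Below is a minimal standalone sketch of just that rounding, assuming byte-sized inputs and a hypothetical mb_round_request() helper; the start_off alignment and the NRL_CHECK_SIZE chunk constraints for the multi-megabyte cases are deliberately omitted.

    #include <stdio.h>

    /* hypothetical helper: round a predicted size (in bytes) up to the
     * next preallocation size class, mirroring the if/else ladder above */
    static long long mb_round_request(long long size)
    {
            static const long long classes[] = {
                    16 << 10, 32 << 10, 64 << 10, 128 << 10, 256 << 10,
                    512 << 10, 1 << 20, 2 << 20, 4 << 20, 8 << 20,
            };
            for (unsigned int i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
                    if (size <= classes[i])
                            return classes[i];
            return size;            /* very large requests are kept as-is */
    }

    int main(void)
    {
            /* a ~100 KiB file is normalized to a 128 KiB goal: prints 131072 */
            printf("%lld\n", mb_round_request(100 << 10));
            return 0;
    }
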
3166 : :
3167 : 0 : static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3168 : : {
3169 : 194567 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3170 : :
3171 [ - + ][ # # ]: 194567 : if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3172 : 0 : atomic_inc(&sbi->s_bal_reqs);
3173 : 0 : atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3174 [ # # ]: 0 : if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3175 : 0 : atomic_inc(&sbi->s_bal_success);
3176 : 0 : atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3177 [ # # ][ # # ]: 0 : if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3178 : 0 : ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3179 : 0 : atomic_inc(&sbi->s_bal_goals);
3180 [ - ]: 0 : if (ac->ac_found > sbi->s_mb_max_to_scan)
3181 : 0 : atomic_inc(&sbi->s_bal_breaks);
3182 : : }
3183 : :
3184 [ + + ]: 194551 : if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3185 : : trace_ext4_mballoc_alloc(ac);
3186 : : else
3187 : : trace_ext4_mballoc_prealloc(ac);
3188 : 0 : }
3189 : :
3190 : : /*
3191 : : * Called on failure; free up any blocks from the inode PA for this
3192 : : * context. We don't need this for MB_GROUP_PA because we only change
3193 : : * pa_free in ext4_mb_release_context(), but on failure, we've already
3194 : : * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3195 : : */
3196 : : static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3197 : : {
3198 : : struct ext4_prealloc_space *pa = ac->ac_pa;
3199 : :
3200 [ - + ][ # # ]: 28 : if (pa && pa->pa_type == MB_INODE_PA)
[ # # ][ # # ]
3201 : 0 : pa->pa_free += ac->ac_b_ex.fe_len;
3202 : : }
3203 : :
3204 : : /*
3205 : : * use blocks preallocated to inode
3206 : : */
3207 : 0 : static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3208 : : struct ext4_prealloc_space *pa)
3209 : : {
3210 : 147481 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3211 : : ext4_fsblk_t start;
3212 : : ext4_fsblk_t end;
3213 : : int len;
3214 : :
3215 : : /* found preallocated blocks, use them */
3216 : 147481 : start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3217 : 147481 : end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3218 : : start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3219 : 147481 : len = EXT4_NUM_B2C(sbi, end - start);
3220 : 147481 : ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3221 : : &ac->ac_b_ex.fe_start);
3222 : 294968 : ac->ac_b_ex.fe_len = len;
3223 : 294968 : ac->ac_status = AC_STATUS_FOUND;
3224 : 294968 : ac->ac_pa = pa;
3225 : :
3226 [ - + ]: 294968 : BUG_ON(start < pa->pa_pstart);
3227 [ - + ]: 147487 : BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3228 [ - + ]: 147487 : BUG_ON(pa->pa_free < len);
3229 : 147487 : pa->pa_free -= len;
3230 : :
3231 : : mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3232 : 147487 : }
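
ext4_mb_use_inode_pa() maps the request's logical offset into the PA onto physical blocks and clips the length to what remains of the PA. A standalone numeric sketch of that window computation, assuming one block per cluster (so the EXT4_C2B conversions disappear):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long pa_pstart = 1000, pa_lstart = 50;    /* PA placement */
            unsigned long long pa_len = 32;                         /* PA length    */
            unsigned long long req_logical = 60, req_len = 30;      /* the request  */

            /* the physical start tracks the logical offset into the PA */
            unsigned long long start = pa_pstart + (req_logical - pa_lstart);
            /* the length is clipped to whatever of the PA remains */
            unsigned long long end = start + req_len;
            if (end > pa_pstart + pa_len)
                    end = pa_pstart + pa_len;

            /* prints: use 22 blocks at 1010 (the request was clipped) */
            printf("use %llu blocks at %llu\n", end - start, start);
            return 0;
    }
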
3233 : :
3234 : : /*
3235 : : * use blocks preallocated to locality group
3236 : : */
3237 : 0 : static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3238 : : struct ext4_prealloc_space *pa)
3239 : : {
3240 : 3285 : unsigned int len = ac->ac_o_ex.fe_len;
3241 : :
3242 : 3285 : ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3243 : : &ac->ac_b_ex.fe_group,
3244 : : &ac->ac_b_ex.fe_start);
3245 : 3285 : ac->ac_b_ex.fe_len = len;
3246 : 3285 : ac->ac_status = AC_STATUS_FOUND;
3247 : 3285 : ac->ac_pa = pa;
3248 : :
3249 : : /* we don't correct pa_pstart or pa_len here to avoid a
3250 : : * possible race when the group is being loaded concurrently;
3251 : : * instead we correct the pa later, after blocks are marked
3252 : : * in the on-disk bitmap -- see ext4_mb_release_context().
3253 : : * Other CPUs are prevented from allocating from this pa by lg_mutex
3254 : : */
3255 : : mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3256 : 3285 : }
3257 : :
3258 : : /*
3259 : : * Return the prealloc space that has the minimal distance
3260 : : * from the goal block. @cpa is the prealloc
3261 : : * space with the currently known minimal distance
3262 : : * from the goal block.
3263 : : */
3264 : : static struct ext4_prealloc_space *
3265 : 0 : ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3266 : : struct ext4_prealloc_space *pa,
3267 : : struct ext4_prealloc_space *cpa)
3268 : : {
3269 : : ext4_fsblk_t cur_distance, new_distance;
3270 : :
3271 [ + + ]: 3277 : if (cpa == NULL) {
3272 : 3273 : atomic_inc(&pa->pa_count);
3273 : 3273 : return pa;
3274 : : }
3275 : 4 : cur_distance = abs(goal_block - cpa->pa_pstart);
3276 : 4 : new_distance = abs(goal_block - pa->pa_pstart);
3277 : :
3278 [ - + ]: 4 : if (cur_distance <= new_distance)
3279 : : return cpa;
3280 : :
3281 : : /* drop the previous reference */
3282 : 0 : atomic_dec(&cpa->pa_count);
3283 : 0 : atomic_inc(&pa->pa_count);
3284 : 0 : return pa;
3285 : : }
3286 : :
3287 : : /*
3288 : : * search goal blocks in preallocated space
3289 : : */
3290 : : static noinline_for_stack int
3291 : 0 : ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3292 : : {
3293 : 194500 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3294 : : int order, i;
3295 : 194500 : struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3296 : : struct ext4_locality_group *lg;
3297 : : struct ext4_prealloc_space *pa, *cpa = NULL;
3298 : : ext4_fsblk_t goal_block;
3299 : :
3300 : : /* only data can be preallocated */
3301 [ + + ]: 194500 : if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3302 : : return 0;
3303 : :
3304 : : /* first, try per-file preallocation */
3305 : : rcu_read_lock();
3306 [ + + ]: 334373 : list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3307 : :
3308 : : /* all fields in this condition don't change,
3309 : : * so we can skip locking for them */
3310 [ + + ][ + + ]: 303453 : if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3311 : 261256 : ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3312 : 261256 : EXT4_C2B(sbi, pa->pa_len)))
3313 : 178586 : continue;
3314 : :
3315 : : /* non-extent files can't have physical blocks past 2^32 */
3316 [ - + ][ # # ]: 124867 : if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3317 : 0 : (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3318 : : EXT4_MAX_BLOCK_FILE_PHYS))
3319 : 0 : continue;
3320 : :
3321 : : /* found preallocated blocks, use them */
3322 : : spin_lock(&pa->pa_lock);
3323 [ + ][ + + ]: 124978 : if (pa->pa_deleted == 0 && pa->pa_free) {
3324 : 124977 : atomic_inc(&pa->pa_count);
3325 : 124978 : ext4_mb_use_inode_pa(ac, pa);
3326 : : spin_unlock(&pa->pa_lock);
3327 : 124991 : ac->ac_criteria = 10;
3328 : : rcu_read_unlock();
3329 : 124990 : return 1;
3330 : : }
3331 : : spin_unlock(&pa->pa_lock);
3332 : : }
3333 : : rcu_read_unlock();
3334 : :
3335 : : /* can we use group allocation? */
3336 [ + + ]: 30920 : if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3337 : : return 0;
3338 : :
3339 : : /* inode may have no locality group for some reason */
3340 : 3286 : lg = ac->ac_lg;
3341 [ + ]: 3286 : if (lg == NULL)
3342 : : return 0;
3343 : 201072 : order = fls(ac->ac_o_ex.fe_len) - 1;
3344 [ - + ]: 3286 : if (order > PREALLOC_TB_SIZE - 1)
3345 : : /* The max size of hash table is PREALLOC_TB_SIZE */
3346 : : order = PREALLOC_TB_SIZE - 1;
3347 : :
3348 : 3286 : goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3349 : : /*
3350 : : * search for the prealloc space with the
3351 : : * minimal distance from the goal block.
3352 : : */
3353 [ + + ]: 35413 : for (i = order; i < PREALLOC_TB_SIZE; i++) {
3354 : : rcu_read_lock();
3355 [ + + ]: 35404 : list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3356 : : pa_inode_list) {
3357 : : spin_lock(&pa->pa_lock);
3358 [ + - ][ + - ]: 3277 : if (pa->pa_deleted == 0 &&
3359 : 3277 : pa->pa_free >= ac->ac_o_ex.fe_len) {
3360 : :
3361 : 3277 : cpa = ext4_mb_check_group_pa(goal_block,
3362 : : pa, cpa);
3363 : : }
3364 : : spin_unlock(&pa->pa_lock);
3365 : : }
3366 : : rcu_read_unlock();
3367 : : }
3368 [ + + ]: 3286 : if (cpa) {
3369 : 3273 : ext4_mb_use_group_pa(ac, cpa);
3370 : 3273 : ac->ac_criteria = 20;
3371 : 3273 : return 1;
3372 : : }
3373 : : return 0;
3374 : : }
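
The locality-group search above hashes PAs by the power-of-two order of the request length and scans every bucket from that order upward. A standalone sketch of the bucket selection, with fls_sketch() as a userspace stand-in for the kernel's fls(); PREALLOC_TB_SIZE is 10 in this kernel's mballoc.h:

    #include <stdio.h>

    #define PREALLOC_TB_SIZE 10

    static int fls_sketch(unsigned int x)   /* userspace stand-in for fls() */
    {
            int r = 0;
            while (x) {
                    r++;
                    x >>= 1;
            }
            return r;
    }

    int main(void)
    {
            unsigned int len = 37;                  /* request length in clusters */
            int order = fls_sketch(len) - 1;        /* 37 -> order 5 (32..63)     */

            if (order > PREALLOC_TB_SIZE - 1)
                    order = PREALLOC_TB_SIZE - 1;
            printf("search buckets %d..%d\n", order, PREALLOC_TB_SIZE - 1);
            return 0;
    }
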
3375 : :
3376 : : /*
3377 : : * the function goes through all blocks freed in the group
3378 : : * but not yet committed and marks them used in the in-core bitmap.
3379 : : * the buddy must be generated from this bitmap.
3380 : : * Must be called with the ext4 group lock held
3381 : : */
3382 : 0 : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3383 : : ext4_group_t group)
3384 : : {
3385 : : struct rb_node *n;
3386 : : struct ext4_group_info *grp;
3387 : : struct ext4_free_data *entry;
3388 : :
3389 : : grp = ext4_get_group_info(sb, group);
3390 : 168 : n = rb_first(&(grp->bb_free_root));
3391 : :
3392 [ - + ]: 168 : while (n) {
3393 : : entry = rb_entry(n, struct ext4_free_data, efd_node);
3394 : 0 : ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3395 : 0 : n = rb_next(n);
3396 : : }
3397 : 168 : return;
3398 : : }
3399 : :
3400 : : /*
3401 : : * the function goes through all preallocations in this group and marks them
3402 : : * used in the in-core bitmap. the buddy must be generated from this bitmap.
3403 : : * Must be called with the ext4 group lock held
3404 : : */
3405 : : static noinline_for_stack
3406 : 0 : void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3407 : : ext4_group_t group)
3408 : : {
3409 : : struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3410 : : struct ext4_prealloc_space *pa;
3411 : : struct list_head *cur;
3412 : : ext4_group_t groupnr;
3413 : : ext4_grpblk_t start;
3414 : : int preallocated = 0;
3415 : : int len;
3416 : :
3417 : : /* every form of preallocation discard loads the group first,
3418 : : * so the only competing code is preallocation use;
3419 : : * we don't need any locking here.
3420 : : * notice we do NOT skip preallocations with pa_deleted set,
3421 : : * otherwise we could leave used blocks available for
3422 : : * allocation in the buddy while a concurrent ext4_mb_put_pa()
3423 : : * is dropping the preallocation
3424 : : */
3425 [ + + ]: 187 : list_for_each(cur, &grp->bb_prealloc_list) {
3426 : : pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3427 : : spin_lock(&pa->pa_lock);
3428 : 19 : ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3429 : : &groupnr, &start);
3430 : 19 : len = pa->pa_len;
3431 : : spin_unlock(&pa->pa_lock);
3432 [ - + ]: 187 : if (unlikely(len == 0))
3433 : 0 : continue;
3434 [ - + ]: 19 : BUG_ON(groupnr != group);
3435 : 19 : ext4_set_bits(bitmap, start, len);
3436 : : preallocated += len;
3437 : : }
3438 : : mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3439 : 168 : }
3440 : :
3441 : 0 : static void ext4_mb_pa_callback(struct rcu_head *head)
3442 : : {
3443 : : struct ext4_prealloc_space *pa;
3444 : 22473 : pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3445 : :
3446 [ - + ]: 22473 : BUG_ON(atomic_read(&pa->pa_count));
3447 [ - + ]: 22473 : BUG_ON(pa->pa_deleted == 0);
3448 : 22473 : kmem_cache_free(ext4_pspace_cachep, pa);
3449 : 22496 : }
3450 : :
3451 : : /*
3452 : : * drops a reference to preallocated space descriptor
3453 : : * if this was the last reference and the space is consumed
3454 : : */
3455 : 150776 : static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3456 : : struct super_block *sb, struct ext4_prealloc_space *pa)
3457 : : {
3458 : : ext4_group_t grp;
3459 : : ext4_fsblk_t grp_blk;
3460 : :
3461 : : /* in this short window concurrent discard can set pa_deleted */
3462 : : spin_lock(&pa->pa_lock);
3463 [ + - ][ + + ]: 150764 : if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3464 : : spin_unlock(&pa->pa_lock);
3465 : : return;
3466 : : }
3467 : :
3468 [ - + ]: 357 : if (pa->pa_deleted == 1) {
3469 : : spin_unlock(&pa->pa_lock);
3470 : : return;
3471 : : }
3472 : :
3473 : 357 : pa->pa_deleted = 1;
3474 : : spin_unlock(&pa->pa_lock);
3475 : :
3476 : 357 : grp_blk = pa->pa_pstart;
3477 : : /*
3478 : : * If doing group-based preallocation, pa_pstart may be in the
3479 : : * next group when pa is used up
3480 : : */
3481 [ + + ]: 357 : if (pa->pa_type == MB_GROUP_PA)
3482 : 8 : grp_blk--;
3483 : :
3484 : 357 : grp = ext4_get_group_number(sb, grp_blk);
3485 : :
3486 : : /*
3487 : : * possible race:
3488 : : *
3489 : : * P1 (buddy init) P2 (regular allocation)
3490 : : * find block B in PA
3491 : : * copy on-disk bitmap to buddy
3492 : : * mark B in on-disk bitmap
3493 : : * drop PA from group
3494 : : * mark all PAs in buddy
3495 : : *
3496 : : * thus, P1 initializes buddy with B available. to prevent this
3497 : : * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3498 : : * against that pair
3499 : : */
3500 : : ext4_lock_group(sb, grp);
3501 : : list_del(&pa->pa_group_list);
3502 : : ext4_unlock_group(sb, grp);
3503 : :
3504 : 357 : spin_lock(pa->pa_obj_lock);
3505 : : list_del_rcu(&pa->pa_inode_list);
3506 : 357 : spin_unlock(pa->pa_obj_lock);
3507 : :
3508 : 357 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3509 : : }
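
The teardown rule in ext4_mb_put_pa() is: only the holder of the last reference to a fully consumed, not-yet-deleted PA may mark it deleted; unlinking from the group and inode lists and the RCU-deferred free follow. A minimal sketch of just that decision, with pa_lock and the list manipulation elided (field names loosely mirror the kernel struct):

    #include <stdatomic.h>
    #include <stdio.h>

    struct pa_sketch {
            atomic_int refs;        /* pa_count   */
            int free_blocks;        /* pa_free    */
            int deleted;            /* pa_deleted; kernel touches it under pa_lock */
    };

    /* returns 1 when the caller should unlink the PA and free it via RCU */
    static int pa_put(struct pa_sketch *pa)
    {
            if (atomic_fetch_sub(&pa->refs, 1) != 1 || pa->free_blocks != 0)
                    return 0;       /* others still hold it, or space remains */
            if (pa->deleted)
                    return 0;       /* a concurrent discard already claimed it */
            pa->deleted = 1;
            return 1;
    }

    int main(void)
    {
            struct pa_sketch pa = { .free_blocks = 0, .deleted = 0 };

            atomic_init(&pa.refs, 1);
            puts(pa_put(&pa) ? "tear down" : "keep");       /* tear down */
            return 0;
    }
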
3510 : :
3511 : : /*
3512 : : * creates new preallocated space for given inode
3513 : : */
3514 : : static noinline_for_stack int
3515 : 0 : ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3516 : : {
3517 : 22506 : struct super_block *sb = ac->ac_sb;
3518 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
3519 : : struct ext4_prealloc_space *pa;
3520 : : struct ext4_group_info *grp;
3521 : : struct ext4_inode_info *ei;
3522 : :
3523 : : /* preallocate only when the found space is larger than requested */
3524 [ - + ]: 22506 : BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3525 [ - + ]: 22506 : BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3526 [ - + ]: 22506 : BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3527 : :
3528 : 22506 : pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3529 [ + - ]: 22506 : if (pa == NULL)
3530 : : return -ENOMEM;
3531 : :
3532 [ + + ]: 22506 : if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3533 : : int winl;
3534 : : int wins;
3535 : : int win;
3536 : : int offs;
3537 : :
3538 : : /* we can't allocate as much as the normalizer wants,
3539 : : * so the found space must get a proper lstart
3540 : : * to cover the original request */
3541 [ - + ]: 1 : BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3542 [ - + ]: 1 : BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3543 : :
3544 : : /* we're limited by the original request in that the
3545 : : * logical block must be covered in any case;
3546 : : * winl is the window we can move our chunk within */
3547 : 1 : winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3548 : :
3549 : : /* also, we should cover whole original request */
3550 : 1 : wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3551 : :
3552 : : /* the smallest one defines real window */
3553 : 1 : win = min(winl, wins);
3554 : :
3555 : 1 : offs = ac->ac_o_ex.fe_logical %
3556 : 1 : EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3557 [ + - ]: 1 : if (offs && offs < win)
3558 : : win = offs;
3559 : :
3560 : 1 : ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3561 : 1 : EXT4_NUM_B2C(sbi, win);
3562 [ - + ]: 1 : BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3563 [ - + ]: 1 : BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3564 : : }
3565 : :
3566 : : /* preallocation can change ac_b_ex, thus we store actually
3567 : : * allocated blocks for history */
3568 : 22506 : ac->ac_f_ex = ac->ac_b_ex;
3569 : :
3570 : 22506 : pa->pa_lstart = ac->ac_b_ex.fe_logical;
3571 : 45012 : pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3572 : 22506 : pa->pa_len = ac->ac_b_ex.fe_len;
3573 : 22506 : pa->pa_free = pa->pa_len;
3574 : 22506 : atomic_set(&pa->pa_count, 1);
3575 : 22506 : spin_lock_init(&pa->pa_lock);
3576 : 22506 : INIT_LIST_HEAD(&pa->pa_inode_list);
3577 : 22506 : INIT_LIST_HEAD(&pa->pa_group_list);
3578 : 22506 : pa->pa_deleted = 0;
3579 : 22506 : pa->pa_type = MB_INODE_PA;
3580 : :
3581 : : mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3582 : : pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3583 : : trace_ext4_mb_new_inode_pa(ac, pa);
3584 : :
3585 : 0 : ext4_mb_use_inode_pa(ac, pa);
3586 : 22507 : atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3587 : :
3588 : 22506 : ei = EXT4_I(ac->ac_inode);
3589 : 22506 : grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3590 : :
3591 : 22506 : pa->pa_obj_lock = &ei->i_prealloc_lock;
3592 : 22506 : pa->pa_inode = ac->ac_inode;
3593 : :
3594 : 22506 : ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3595 : 22506 : list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3596 : 22506 : ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3597 : :
3598 : 22507 : spin_lock(pa->pa_obj_lock);
3599 : 22506 : list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3600 : 22506 : spin_unlock(pa->pa_obj_lock);
3601 : :
3602 : 22507 : return 0;
3603 : : }
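
The winl/wins/offs adjustment above slides the found extent's logical start left just enough to still cover the original request, preferring an aligned position when one fits in the window. A standalone numeric sketch, assuming one block per cluster:

    #include <stdio.h>

    int main(void)
    {
            unsigned int g_logical = 0, o_logical = 100;    /* goal/original start */
            unsigned int o_len = 8, b_len = 32;     /* found 32 blocks, goal was more */

            unsigned int winl = o_logical - g_logical;      /* slack to the left: 100 */
            unsigned int wins = b_len - o_len;      /* must still cover request: 24   */
            unsigned int win = winl < wins ? winl : wins;
            unsigned int offs = o_logical % b_len;  /* aligned slide: 100 % 32 = 4    */

            if (offs && offs < win)
                    win = offs;

            /* prints: place 32 blocks at logical 96 (covers 100..107, aligned) */
            printf("place %u blocks at logical %u\n", b_len, o_logical - win);
            return 0;
    }
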
3604 : :
3605 : : /*
3606 : : * creates new preallocated space for the locality group this inode belongs to
3607 : : */
3608 : : static noinline_for_stack int
3609 : 0 : ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3610 : : {
3611 : 24 : struct super_block *sb = ac->ac_sb;
3612 : : struct ext4_locality_group *lg;
3613 : : struct ext4_prealloc_space *pa;
3614 : : struct ext4_group_info *grp;
3615 : :
3616 : : /* preallocate only when the found space is larger than requested */
3617 [ - + ]: 12 : BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3618 [ - + ]: 12 : BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3619 [ - + ]: 12 : BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3620 : :
3621 [ - + ]: 12 : BUG_ON(ext4_pspace_cachep == NULL);
3622 : 12 : pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3623 [ + - ]: 12 : if (pa == NULL)
3624 : : return -ENOMEM;
3625 : :
3626 : : /* preallocation can change ac_b_ex, thus we store actually
3627 : : * allocated blocks for history */
3628 : 12 : ac->ac_f_ex = ac->ac_b_ex;
3629 : :
3630 : 24 : pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3631 : 12 : pa->pa_lstart = pa->pa_pstart;
3632 : 12 : pa->pa_len = ac->ac_b_ex.fe_len;
3633 : 12 : pa->pa_free = pa->pa_len;
3634 : 12 : atomic_set(&pa->pa_count, 1);
3635 : 12 : spin_lock_init(&pa->pa_lock);
3636 : 12 : INIT_LIST_HEAD(&pa->pa_inode_list);
3637 : 12 : INIT_LIST_HEAD(&pa->pa_group_list);
3638 : 12 : pa->pa_deleted = 0;
3639 : 12 : pa->pa_type = MB_GROUP_PA;
3640 : :
3641 : : mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3642 : : pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 : : trace_ext4_mb_new_group_pa(ac, pa);
3644 : :
3645 : 12 : ext4_mb_use_group_pa(ac, pa);
3646 : 24 : atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3647 : :
3648 : 12 : grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3649 : 12 : lg = ac->ac_lg;
3650 [ - + ]: 12 : BUG_ON(lg == NULL);
3651 : :
3652 : 12 : pa->pa_obj_lock = &lg->lg_prealloc_lock;
3653 : 12 : pa->pa_inode = NULL;
3654 : :
3655 : 12 : ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3656 : 12 : list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3657 : 12 : ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3658 : :
3659 : : /*
3660 : : * We will later add the new pa to the right bucket
3661 : : * after updating the pa_free in ext4_mb_release_context
3662 : : */
3663 : 12 : return 0;
3664 : : }
3665 : :
3666 : 0 : static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3667 : : {
3668 : : int err;
3669 : :
3670 [ + + ]: 22519 : if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3671 : 12 : err = ext4_mb_new_group_pa(ac);
3672 : : else
3673 : 22507 : err = ext4_mb_new_inode_pa(ac);
3674 : 22517 : return err;
3675 : : }
3676 : :
3677 : : /*
3678 : : * finds all unused blocks in the on-disk bitmap, frees them in the
3679 : : * in-core bitmap and buddy.
3680 : : * @pa must be unlinked from inode and group lists, so that
3681 : : * nobody else can find/use it.
3682 : : * the caller MUST hold group/inode locks.
3683 : : * TODO: optimize the case when there are no in-core structures yet
3684 : : */
3685 : : static noinline_for_stack int
3686 : 0 : ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3687 : : struct ext4_prealloc_space *pa)
3688 : : {
3689 : 22156 : struct super_block *sb = e4b->bd_sb;
3690 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
3691 : : unsigned int end;
3692 : : unsigned int next;
3693 : : ext4_group_t group;
3694 : : ext4_grpblk_t bit;
3695 : : unsigned long long grp_blk_start;
3696 : : int err = 0;
3697 : : int free = 0;
3698 : :
3699 [ - + ]: 22156 : BUG_ON(pa->pa_deleted == 0);
3700 : 22156 : ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3701 : 22151 : grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3702 [ - + ][ # # ]: 22151 : BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3703 : 22151 : end = bit + pa->pa_len;
3704 : :
3705 [ + + ]: 105034 : while (bit < end) {
3706 : 166873 : bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3707 [ + + ]: 83435 : if (bit >= end)
3708 : : break;
3709 : 165788 : next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3710 : : mb_debug(1, " free preallocated %u/%u in group %u\n",
3711 : : (unsigned) ext4_group_first_block_no(sb, group) + bit,
3712 : : (unsigned) next - bit, (unsigned) group);
3713 : 82903 : free += next - bit;
3714 : :
3715 : 82903 : trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3716 : 82899 : trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3717 : 82899 : EXT4_C2B(sbi, bit)),
3718 : : next - bit);
3719 : 82899 : mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3720 : 82883 : bit = next + 1;
3721 : : }
3722 [ - + ]: 22146 : if (free != pa->pa_free) {
3723 : 0 : ext4_msg(e4b->bd_sb, KERN_CRIT,
3724 : : "pa %p: logic %lu, phys. %lu, len %lu",
3725 : : pa, (unsigned long) pa->pa_lstart,
3726 : : (unsigned long) pa->pa_pstart,
3727 : : (unsigned long) pa->pa_len);
3728 : 0 : ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3729 : : free, pa->pa_free);
3730 : : /*
3731 : : * pa is already deleted so we use the value obtained
3732 : : * from the bitmap and continue.
3733 : : */
3734 : : }
3735 : 0 : atomic_add(free, &sbi->s_mb_discarded);
3736 : :
3737 : 22158 : return err;
3738 : : }
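
The loop in ext4_mb_release_inode_pa() scans the on-disk bitmap between the PA's bounds and frees each maximal run of zero (unused) bits, totalling them for the pa_free consistency check. A standalone sketch with naive bit helpers standing in for mb_find_next_zero_bit()/mb_find_next_bit():

    #include <stdio.h>

    static unsigned next_zero(const unsigned char *map, unsigned end, unsigned b)
    {
            while (b < end && (map[b / 8] & (1u << (b % 8))))
                    b++;
            return b;
    }

    static unsigned next_set(const unsigned char *map, unsigned end, unsigned b)
    {
            while (b < end && !(map[b / 8] & (1u << (b % 8))))
                    b++;
            return b;
    }

    int main(void)
    {
            unsigned char bitmap[] = { 0xF0, 0x0F };        /* bits 0-3, 12-15 free */
            unsigned bit = 0, end = 16, freed = 0;

            while (bit < end) {
                    bit = next_zero(bitmap, end, bit);
                    if (bit >= end)
                            break;
                    unsigned next = next_set(bitmap, end, bit);
                    printf("free run [%u, %u)\n", bit, next);   /* [0,4) then [12,16) */
                    freed += next - bit;
                    bit = next + 1;
            }
            printf("freed %u\n", freed);    /* 8; compared against pa_free above */
            return 0;
    }
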
3739 : :
3740 : : static noinline_for_stack int
3741 : 0 : ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3742 : : struct ext4_prealloc_space *pa)
3743 : : {
3744 : 0 : struct super_block *sb = e4b->bd_sb;
3745 : : ext4_group_t group;
3746 : : ext4_grpblk_t bit;
3747 : :
3748 : : trace_ext4_mb_release_group_pa(sb, pa);
3749 [ # # ]: 0 : BUG_ON(pa->pa_deleted == 0);
3750 : 0 : ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3751 [ # # ][ # # ]: 0 : BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3752 : 0 : mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3753 : 0 : atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3754 : 0 : trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3755 : :
3756 : 0 : return 0;
3757 : : }
3758 : :
3759 : : /*
3760 : : * releases all preallocations in the given group
3761 : : *
3762 : : * first, we need to decide discard policy:
3763 : : * - when do we discard
3764 : : * 1) ENOSPC
3765 : : * - how many do we discard
3766 : : * 1) how many requested
3767 : : */
3768 : : static noinline_for_stack int
3769 : 0 : ext4_mb_discard_group_preallocations(struct super_block *sb,
3770 : : ext4_group_t group, int needed)
3771 : : {
3772 : : struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3773 : : struct buffer_head *bitmap_bh = NULL;
3774 : : struct ext4_prealloc_space *pa, *tmp;
3775 : : struct list_head list;
3776 : : struct ext4_buddy e4b;
3777 : : int err;
3778 : : int busy = 0;
3779 : : int free = 0;
3780 : :
3781 : : mb_debug(1, "discard preallocation for group %u\n", group);
3782 : :
3783 [ # # ]: 0 : if (list_empty(&grp->bb_prealloc_list))
3784 : : return 0;
3785 : :
3786 : 0 : bitmap_bh = ext4_read_block_bitmap(sb, group);
3787 [ # # ]: 0 : if (bitmap_bh == NULL) {
3788 : 0 : ext4_error(sb, "Error reading block bitmap for %u", group);
3789 : 0 : return 0;
3790 : : }
3791 : :
3792 : 0 : err = ext4_mb_load_buddy(sb, group, &e4b);
3793 [ # # ]: 0 : if (err) {
3794 : 0 : ext4_error(sb, "Error loading buddy information for %u", group);
3795 : : put_bh(bitmap_bh);
3796 : 0 : return 0;
3797 : : }
3798 : :
3799 [ # # ]: 0 : if (needed == 0)
3800 : 0 : needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3801 : :
3802 : : INIT_LIST_HEAD(&list);
3803 : : repeat:
3804 : : ext4_lock_group(sb, group);
3805 [ # # ]: 0 : list_for_each_entry_safe(pa, tmp,
3806 : : &grp->bb_prealloc_list, pa_group_list) {
3807 : : spin_lock(&pa->pa_lock);
3808 [ # # ]: 0 : if (atomic_read(&pa->pa_count)) {
3809 : : spin_unlock(&pa->pa_lock);
3810 : : busy = 1;
3811 : 0 : continue;
3812 : : }
3813 [ # # ]: 0 : if (pa->pa_deleted) {
3814 : : spin_unlock(&pa->pa_lock);
3815 : 0 : continue;
3816 : : }
3817 : :
3818 : : /* seems this one can be freed ... */
3819 : 0 : pa->pa_deleted = 1;
3820 : :
3821 : : /* we can trust pa_free ... */
3822 : 0 : free += pa->pa_free;
3823 : :
3824 : : spin_unlock(&pa->pa_lock);
3825 : :
3826 : : list_del(&pa->pa_group_list);
3827 : 0 : list_add(&pa->u.pa_tmp_list, &list);
3828 : : }
3829 : :
3830 : : /* if we still need more blocks and some PAs were used, try again */
3831 [ # # ]: 0 : if (free < needed && busy) {
3832 : : busy = 0;
3833 : : ext4_unlock_group(sb, group);
3834 : 0 : cond_resched();
3835 : 0 : goto repeat;
3836 : : }
3837 : :
3838 : : /* found anything to free? */
3839 [ # # ]: 0 : if (list_empty(&list)) {
3840 [ # # ]: 0 : BUG_ON(free != 0);
3841 : : goto out;
3842 : : }
3843 : :
3844 : : /* now free all selected PAs */
3845 [ # # ]: 0 : list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3846 : :
3847 : : /* remove from object (inode or locality group) */
3848 : 0 : spin_lock(pa->pa_obj_lock);
3849 : : list_del_rcu(&pa->pa_inode_list);
3850 : 0 : spin_unlock(pa->pa_obj_lock);
3851 : :
3852 [ # # ]: 0 : if (pa->pa_type == MB_GROUP_PA)
3853 : 0 : ext4_mb_release_group_pa(&e4b, pa);
3854 : : else
3855 : 0 : ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3856 : :
3857 : : list_del(&pa->u.pa_tmp_list);
3858 : 0 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3859 : : }
3860 : :
3861 : : out:
3862 : : ext4_unlock_group(sb, group);
3863 : 0 : ext4_mb_unload_buddy(&e4b);
3864 : : put_bh(bitmap_bh);
3865 : 0 : return free;
3866 : : }
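
The repeat loop above moves deletable PAs onto a private list under the group lock and retries when some PAs were busy (still referenced) and not enough space was freed yet. A standalone sketch of that retry pattern, with the locking elided and one busy PA artificially released between passes so the second iteration is visible:

    #include <stdio.h>

    struct pa_sketch { int in_use; int free_blocks; int deleted; };

    int main(void)
    {
            struct pa_sketch pas[] = { { 1, 8, 0 }, { 0, 16, 0 }, { 0, 4, 0 } };
            int needed = 24, freed = 0, busy;

            do {
                    busy = 0;
                    for (int i = 0; i < 3; i++) {
                            if (pas[i].deleted)
                                    continue;
                            if (pas[i].in_use) {    /* referenced: skip for now */
                                    busy = 1;
                                    continue;
                            }
                            pas[i].deleted = 1;     /* claim it for freeing */
                            freed += pas[i].free_blocks;
                    }
                    if (freed < needed && busy)
                            pas[0].in_use = 0;      /* simulate the holder finishing */
            } while (freed < needed && busy);

            printf("freed %d of %d needed\n", freed, needed);       /* freed 28 */
            return 0;
    }
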
3867 : :
3868 : : /*
3869 : : * releases all unused preallocated blocks for the given inode
3870 : : *
3871 : : * It's important to discard preallocations under i_data_sem
3872 : : * We don't want another block to be served from the prealloc
3873 : : * space when we are discarding the inode prealloc space.
3874 : : *
3875 : : * FIXME!! Make sure it is valid at all the call sites
3876 : : */
3877 : 0 : void ext4_discard_preallocations(struct inode *inode)
3878 : : {
3879 : : struct ext4_inode_info *ei = EXT4_I(inode);
3880 : 558111 : struct super_block *sb = inode->i_sb;
3881 : : struct buffer_head *bitmap_bh = NULL;
3882 : : struct ext4_prealloc_space *pa, *tmp;
3883 : : ext4_group_t group = 0;
3884 : : struct list_head list;
3885 : : struct ext4_buddy e4b;
3886 : : int err;
3887 : :
3888 [ + + ]: 558111 : if (!S_ISREG(inode->i_mode)) {
3889 : : /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3890 : 123971 : return;
3891 : : }
3892 : :
3893 : : mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3894 : : trace_ext4_discard_preallocations(inode);
3895 : :
3896 : : INIT_LIST_HEAD(&list);
3897 : :
3898 : : repeat:
3899 : : /* first, collect all pa's in the inode */
3900 : : spin_lock(&ei->i_prealloc_lock);
3901 [ + + ]: 456295 : while (!list_empty(&ei->i_prealloc_list)) {
3902 : 22154 : pa = list_entry(ei->i_prealloc_list.next,
3903 : : struct ext4_prealloc_space, pa_inode_list);
3904 [ + + ]: 22154 : BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3905 : : spin_lock(&pa->pa_lock);
3906 [ - + ]: 22149 : if (atomic_read(&pa->pa_count)) {
3907 : : /* this shouldn't happen often - nobody should
3908 : : * use preallocation while we're discarding it */
3909 : : spin_unlock(&pa->pa_lock);
3910 : : spin_unlock(&ei->i_prealloc_lock);
3911 : 0 : ext4_msg(sb, KERN_ERR,
3912 : : "uh-oh! used pa while discarding");
3913 : 0 : WARN_ON(1);
3914 : 0 : schedule_timeout_uninterruptible(HZ);
3915 : 0 : goto repeat;
3916 : :
3917 : : }
3918 [ + - ]: 22149 : if (pa->pa_deleted == 0) {
3919 : 22149 : pa->pa_deleted = 1;
3920 : : spin_unlock(&pa->pa_lock);
3921 : : list_del_rcu(&pa->pa_inode_list);
3922 : 22151 : list_add(&pa->u.pa_tmp_list, &list);
3923 : 22151 : continue;
3924 : : }
3925 : :
3926 : : /* someone is deleting pa right now */
3927 : : spin_unlock(&pa->pa_lock);
3928 : : spin_unlock(&ei->i_prealloc_lock);
3929 : :
3930 : : /* we have to wait here because pa_deleted
3931 : : * doesn't mean the pa is already unlinked from
3932 : : * the list. since we might be called from
3933 : : * ->clear_inode(), the inode will get freed,
3934 : : * and a concurrent thread unlinking the
3935 : : * pa from the inode's list may access already
3936 : : * freed memory -- bad-bad-bad */
3937 : :
3938 : : /* XXX: if this happens too often, we can
3939 : : * add a flag to force wait only in case
3940 : : * of ->clear_inode(), but not in case of
3941 : : * regular truncate */
3942 : 0 : schedule_timeout_uninterruptible(HZ);
3943 : 0 : goto repeat;
3944 : : }
3945 : : spin_unlock(&ei->i_prealloc_lock);
3946 : :
3947 [ + + ]: 1014405 : list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3948 [ + + ]: 22153 : BUG_ON(pa->pa_type != MB_INODE_PA);
3949 : 22152 : group = ext4_get_group_number(sb, pa->pa_pstart);
3950 : :
3951 : 22154 : err = ext4_mb_load_buddy(sb, group, &e4b);
3952 [ - + ]: 22156 : if (err) {
3953 : 0 : ext4_error(sb, "Error loading buddy information for %u",
3954 : : group);
3955 : 0 : continue;
3956 : : }
3957 : :
3958 : 22156 : bitmap_bh = ext4_read_block_bitmap(sb, group);
3959 [ - + ]: 22147 : if (bitmap_bh == NULL) {
3960 : 0 : ext4_error(sb, "Error reading block bitmap for %u",
3961 : : group);
3962 : 0 : ext4_mb_unload_buddy(&e4b);
3963 : 0 : continue;
3964 : : }
3965 : :
3966 : : ext4_lock_group(sb, group);
3967 : : list_del(&pa->pa_group_list);
3968 : 22156 : ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3969 : : ext4_unlock_group(sb, group);
3970 : :
3971 : 22158 : ext4_mb_unload_buddy(&e4b);
3972 : : put_bh(bitmap_bh);
3973 : :
3974 : : list_del(&pa->u.pa_tmp_list);
3975 : 22157 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3976 : : }
3977 : : }
3978 : :
3979 : : #ifdef CONFIG_EXT4_DEBUG
3980 : : static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3981 : : {
3982 : : struct super_block *sb = ac->ac_sb;
3983 : : ext4_group_t ngroups, i;
3984 : :
3985 : : if (!ext4_mballoc_debug ||
3986 : : (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3987 : : return;
3988 : :
3989 : : ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
3990 : : " Allocation context details:");
3991 : : ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
3992 : : ac->ac_status, ac->ac_flags);
3993 : : ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
3994 : : "goal %lu/%lu/%lu@%lu, "
3995 : : "best %lu/%lu/%lu@%lu cr %d",
3996 : : (unsigned long)ac->ac_o_ex.fe_group,
3997 : : (unsigned long)ac->ac_o_ex.fe_start,
3998 : : (unsigned long)ac->ac_o_ex.fe_len,
3999 : : (unsigned long)ac->ac_o_ex.fe_logical,
4000 : : (unsigned long)ac->ac_g_ex.fe_group,
4001 : : (unsigned long)ac->ac_g_ex.fe_start,
4002 : : (unsigned long)ac->ac_g_ex.fe_len,
4003 : : (unsigned long)ac->ac_g_ex.fe_logical,
4004 : : (unsigned long)ac->ac_b_ex.fe_group,
4005 : : (unsigned long)ac->ac_b_ex.fe_start,
4006 : : (unsigned long)ac->ac_b_ex.fe_len,
4007 : : (unsigned long)ac->ac_b_ex.fe_logical,
4008 : : (int)ac->ac_criteria);
4009 : : ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
4010 : : ac->ac_ex_scanned, ac->ac_found);
4011 : : ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4012 : : ngroups = ext4_get_groups_count(sb);
4013 : : for (i = 0; i < ngroups; i++) {
4014 : : struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4015 : : struct ext4_prealloc_space *pa;
4016 : : ext4_grpblk_t start;
4017 : : struct list_head *cur;
4018 : : ext4_lock_group(sb, i);
4019 : : list_for_each(cur, &grp->bb_prealloc_list) {
4020 : : pa = list_entry(cur, struct ext4_prealloc_space,
4021 : : pa_group_list);
4022 : : spin_lock(&pa->pa_lock);
4023 : : ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4024 : : NULL, &start);
4025 : : spin_unlock(&pa->pa_lock);
4026 : : printk(KERN_ERR "PA:%u:%d:%u \n", i,
4027 : : start, pa->pa_len);
4028 : : }
4029 : : ext4_unlock_group(sb, i);
4030 : :
4031 : : if (grp->bb_free == 0)
4032 : : continue;
4033 : : printk(KERN_ERR "%u: %d/%d \n",
4034 : : i, grp->bb_free, grp->bb_fragments);
4035 : : }
4036 : : printk(KERN_ERR "\n");
4037 : : }
4038 : : #else
4039 : : static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4040 : : {
4041 : : return;
4042 : : }
4043 : : #endif
4044 : :
4045 : : /*
4046 : : * We use locality group preallocation for small files. The size of the
4047 : : * file is taken to be the larger of the current size and the resulting
4048 : : * size after allocation.
4049 : : *
4050 : : * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4051 : : */
4052 : 0 : static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4053 : : {
4054 : 194555 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4055 : 194555 : int bsbits = ac->ac_sb->s_blocksize_bits;
4056 : : loff_t size, isize;
4057 : :
4058 [ + + ]: 194555 : if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4059 : : return;
4060 : :
4061 [ + + ]: 155907 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4062 : : return;
4063 : :
4064 : 155904 : size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4065 : 156428 : isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4066 : : >> bsbits;
4067 : :
4068 [ + + ][ + + ]: 350444 : if ((size == isize) &&
4069 [ + + ]: 10683 : !ext4_fs_is_busy(sbi) &&
4070 : 10683 : (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4071 : 1993 : ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4072 : 1993 : return;
4073 : : }
4074 : :
4075 [ - + ]: 153896 : if (sbi->s_mb_group_prealloc <= 0) {
4076 : 0 : ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4077 : 0 : return;
4078 : : }
4079 : :
4080 : : /* don't use group allocation for large files */
4081 : 153896 : size = max(size, isize);
4082 [ + + ]: 153896 : if (size > sbi->s_mb_stream_request) {
4083 : 150609 : ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4084 : 150609 : return;
4085 : : }
4086 : :
4087 [ - + ]: 3287 : BUG_ON(ac->ac_lg != NULL);
4088 : : /*
4089 : : * locality group prealloc space is per-CPU. The reason for having
4090 : : * per-CPU locality groups is to reduce contention between block
4091 : : * requests from multiple CPUs.
4092 : : */
4093 : 6574 : ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
4094 : :
4095 : : /* we're going to use group allocation */
4096 : 3287 : ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4097 : :
4098 : : /* serialize all allocations in the group */
4099 : 3287 : mutex_lock(&ac->ac_lg->lg_mutex);
4100 : : }
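
Condensed, the policy of ext4_mb_group_or_file() is: take the larger of the current and resulting file size (in blocks) and compare it against s_mb_stream_request; small files go to the per-CPU locality group, larger ones to stream (per-inode) allocation. A minimal sketch of that main branch, omitting the closed-file no-prealloc special case; MB_STREAM_REQ is a hypothetical constant standing in for the tunable's default of 16 blocks:

    #include <stdio.h>

    #define MB_STREAM_REQ 16        /* blocks; the mb_stream_req default */

    static const char *mb_policy(long long cur_blocks, long long new_blocks)
    {
            long long size = cur_blocks > new_blocks ? cur_blocks : new_blocks;

            return size > MB_STREAM_REQ ? "stream (per-inode preallocation)"
                                        : "group (per-CPU locality group)";
    }

    int main(void)
    {
            puts(mb_policy(4, 8));          /* small file: group */
            puts(mb_policy(4, 64));         /* large file: stream */
            return 0;
    }
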
4101 : :
4102 : : static noinline_for_stack int
4103 : 0 : ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4104 : : struct ext4_allocation_request *ar)
4105 : : {
4106 : 194549 : struct super_block *sb = ar->inode->i_sb;
4107 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
4108 : 389131 : struct ext4_super_block *es = sbi->s_es;
4109 : : ext4_group_t group;
4110 : : unsigned int len;
4111 : : ext4_fsblk_t goal;
4112 : : ext4_grpblk_t block;
4113 : :
4114 : : /* we can't allocate > group size */
4115 : 194549 : len = ar->len;
4116 : :
4117 : : /* just a dirty hack to filter too big requests */
4118 [ - + ]: 194549 : if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4119 : : len = EXT4_CLUSTERS_PER_GROUP(sb);
4120 : :
4121 : : /* start searching from the goal */
4122 : 194549 : goal = ar->goal;
4123 [ + ][ + + ]: 194549 : if (goal < le32_to_cpu(es->s_first_data_block) ||
4124 : : goal >= ext4_blocks_count(es))
4125 : : goal = le32_to_cpu(es->s_first_data_block);
4126 : 194549 : ext4_get_group_no_and_offset(sb, goal, &group, &block);
4127 : :
4128 : : /* set up allocation goals */
4129 : 194569 : ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4130 : 194569 : ac->ac_status = AC_STATUS_CONTINUE;
4131 : 194569 : ac->ac_sb = sb;
4132 : 194569 : ac->ac_inode = ar->inode;
4133 : 194569 : ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4134 : 194569 : ac->ac_o_ex.fe_group = group;
4135 : 194569 : ac->ac_o_ex.fe_start = block;
4136 : 194569 : ac->ac_o_ex.fe_len = len;
4137 : 194569 : ac->ac_g_ex = ac->ac_o_ex;
4138 : 194569 : ac->ac_flags = ar->flags;
4139 : :
4140 : : /* we have to define the context: will we work with a file or a
4141 : : * locality group? this is a policy, actually */
4142 : 194569 : ext4_mb_group_or_file(ac);
4143 : :
4144 : : mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4145 : : "left: %u/%u, right %u/%u to %swritable\n",
4146 : : (unsigned) ar->len, (unsigned) ar->logical,
4147 : : (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4148 : : (unsigned) ar->lleft, (unsigned) ar->pleft,
4149 : : (unsigned) ar->lright, (unsigned) ar->pright,
4150 : : atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4151 : 194508 : return 0;
4152 : :
4153 : : }
4154 : :
4155 : : static noinline_for_stack void
4156 : 0 : ext4_mb_discard_lg_preallocations(struct super_block *sb,
4157 : : struct ext4_locality_group *lg,
4158 : : int order, int total_entries)
4159 : : {
4160 : : ext4_group_t group = 0;
4161 : : struct ext4_buddy e4b;
4162 : : struct list_head discard_list;
4163 : : struct ext4_prealloc_space *pa, *tmp;
4164 : :
4165 : : mb_debug(1, "discard locality group preallocation\n");
4166 : :
4167 : : INIT_LIST_HEAD(&discard_list);
4168 : :
4169 : : spin_lock(&lg->lg_prealloc_lock);
4170 [ # # ]: 0 : list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4171 : : pa_inode_list) {
4172 : : spin_lock(&pa->pa_lock);
4173 [ # # ]: 0 : if (atomic_read(&pa->pa_count)) {
4174 : : /*
4175 : : * This is the pa that we just used
4176 : : * for block allocation. So don't
4177 : : * free that
4178 : : */
4179 : : spin_unlock(&pa->pa_lock);
4180 : 0 : continue;
4181 : : }
4182 [ # # ]: 0 : if (pa->pa_deleted) {
4183 : : spin_unlock(&pa->pa_lock);
4184 : 0 : continue;
4185 : : }
4186 : : /* only lg prealloc space */
4187 [ # # ]: 0 : BUG_ON(pa->pa_type != MB_GROUP_PA);
4188 : :
4189 : : /* seems this one can be freed ... */
4190 : 0 : pa->pa_deleted = 1;
4191 : : spin_unlock(&pa->pa_lock);
4192 : :
4193 : : list_del_rcu(&pa->pa_inode_list);
4194 : 0 : list_add(&pa->u.pa_tmp_list, &discard_list);
4195 : :
4196 : 0 : total_entries--;
4197 [ # # ]: 0 : if (total_entries <= 5) {
4198 : : /*
4199 : : * we want to keep only 5 entries,
4200 : : * allowing the list to grow to 8. This
4201 : : * makes sure we don't call discard
4202 : : * again soon for this list.
4203 : : */
4204 : : break;
4205 : : }
4206 : : }
4207 : : spin_unlock(&lg->lg_prealloc_lock);
4208 : :
4209 [ # # ]: 0 : list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4210 : :
4211 : 0 : group = ext4_get_group_number(sb, pa->pa_pstart);
4212 [ # # ]: 0 : if (ext4_mb_load_buddy(sb, group, &e4b)) {
4213 : 0 : ext4_error(sb, "Error loading buddy information for %u",
4214 : : group);
4215 : 0 : continue;
4216 : : }
4217 : : ext4_lock_group(sb, group);
4218 : : list_del(&pa->pa_group_list);
4219 : 0 : ext4_mb_release_group_pa(&e4b, pa);
4220 : : ext4_unlock_group(sb, group);
4221 : :
4222 : 0 : ext4_mb_unload_buddy(&e4b);
4223 : : list_del(&pa->u.pa_tmp_list);
4224 : 0 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4225 : : }
4226 : 0 : }
4227 : :
4228 : : /*
4229 : : * We have incremented pa_count. So it cannot be freed at this
4230 : : * point. Also we hold lg_mutex. So no parallel allocation is
4231 : : * possible from this lg. That means pa_free cannot be updated.
4232 : : *
4233 : : * A parallel ext4_mb_discard_group_preallocations is possible,
4234 : : * which can cause the lg_prealloc_list to be updated.
4235 : : */
4236 : :
4237 : 0 : static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4238 : : {
4239 : : int order, added = 0, lg_prealloc_count = 1;
4240 : 3277 : struct super_block *sb = ac->ac_sb;
4241 : 3277 : struct ext4_locality_group *lg = ac->ac_lg;
4242 : 3277 : struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4243 : :
4244 : 3277 : order = fls(pa->pa_free) - 1;
4245 [ # # ]: 0 : if (order > PREALLOC_TB_SIZE - 1)
4246 : : /* The max size of hash table is PREALLOC_TB_SIZE */
4247 : : order = PREALLOC_TB_SIZE - 1;
4248 : : /* Add the prealloc space to lg */
4249 : : spin_lock(&lg->lg_prealloc_lock);
4250 [ - + ]: 3277 : list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4251 : : pa_inode_list) {
4252 : : spin_lock(&tmp_pa->pa_lock);
4253 [ # # ]: 0 : if (tmp_pa->pa_deleted) {
4254 : : spin_unlock(&tmp_pa->pa_lock);
4255 : 0 : continue;
4256 : : }
4257 [ # # ][ # # ]: 0 : if (!added && pa->pa_free < tmp_pa->pa_free) {
4258 : : /* Add to the tail of the previous entry */
4259 : 0 : list_add_tail_rcu(&pa->pa_inode_list,
4260 : : &tmp_pa->pa_inode_list);
4261 : : added = 1;
4262 : : /*
4263 : : * we want to count the total
4264 : : * number of entries in the list
4265 : : */
4266 : : }
4267 : : spin_unlock(&tmp_pa->pa_lock);
4268 : 0 : lg_prealloc_count++;
4269 : : }
4270 [ + - ]: 3277 : if (!added)
4271 : 3277 : list_add_tail_rcu(&pa->pa_inode_list,
4272 : : &lg->lg_prealloc_list[order]);
4273 : : spin_unlock(&lg->lg_prealloc_lock);
4274 : :
4275 : : /* Now trim the list to be not more than 8 elements */
4276 [ - + ]: 3277 : if (lg_prealloc_count > 8) {
4277 : 0 : ext4_mb_discard_lg_preallocations(sb, lg,
4278 : : order, lg_prealloc_count);
4279 : 0 : return;
4280 : : }
4281 : : return;
4282 : : }
4283 : :
4284 : : /*
4285 : : * release all resource we used in allocation
4286 : : */
4287 : 0 : static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4288 : : {
4289 : 194575 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4290 : 194575 : struct ext4_prealloc_space *pa = ac->ac_pa;
4291 [ + + ]: 194575 : if (pa) {
4292 [ + + ]: 150777 : if (pa->pa_type == MB_GROUP_PA) {
4293 : : /* see comment in ext4_mb_use_group_pa() */
4294 : : spin_lock(&pa->pa_lock);
4295 : 3285 : pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4296 : 3285 : pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4297 : 3285 : pa->pa_free -= ac->ac_b_ex.fe_len;
4298 : 3285 : pa->pa_len -= ac->ac_b_ex.fe_len;
4299 : : spin_unlock(&pa->pa_lock);
4300 : : }
4301 : : }
4302 [ + + ]: 389139 : if (pa) {
4303 : : /*
4304 : : * We want to add the pa to the right bucket.
4305 : : * Remove it from the list and while adding
4306 : : * make sure the list to which we are adding
4307 : : * doesn't grow big.
4308 : : */
4309 [ + + ][ + + ]: 150774 : if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4310 : 3277 : spin_lock(pa->pa_obj_lock);
4311 : : list_del_rcu(&pa->pa_inode_list);
4312 : 3277 : spin_unlock(pa->pa_obj_lock);
4313 : 3277 : ext4_mb_add_n_trim(ac);
4314 : : }
4315 : 150774 : ext4_mb_put_pa(ac, ac->ac_sb, pa);
4316 : : }
4317 [ + + ]: 389143 : if (ac->ac_bitmap_page)
4318 : 66306 : page_cache_release(ac->ac_bitmap_page);
4319 [ + + ]: 194559 : if (ac->ac_buddy_page)
4320 : 66292 : page_cache_release(ac->ac_buddy_page);
4321 [ + + ]: 194570 : if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4322 : 3287 : mutex_unlock(&ac->ac_lg->lg_mutex);
4323 : 194570 : ext4_mb_collect_stats(ac);
4324 : 194542 : return 0;
4325 : : }
4326 : :
4327 : 0 : static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4328 : : {
4329 : : ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4330 : : int ret;
4331 : : int freed = 0;
4332 : :
4333 : : trace_ext4_mb_discard_preallocations(sb, needed);
4334 [ # # ]: 0 : for (i = 0; i < ngroups && needed > 0; i++) {
4335 : 0 : ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4336 : 0 : freed += ret;
4337 : 0 : needed -= ret;
4338 : : }
4339 : :
4340 : 0 : return freed;
4341 : : }
4342 : :
4343 : : /*
4344 : : * Main entry point into mballoc to allocate blocks:
4345 : : * it tries to use preallocation first, then falls back
4346 : : * to the usual allocation path
4347 : : */
4348 : 0 : ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4349 : : struct ext4_allocation_request *ar, int *errp)
4350 : : {
4351 : : int freed;
4352 : 28 : struct ext4_allocation_context *ac = NULL;
4353 : : struct ext4_sb_info *sbi;
4354 : 194424 : struct super_block *sb;
4355 : : ext4_fsblk_t block = 0;
4356 : : unsigned int inquota = 0;
4357 : : unsigned int reserv_clstrs = 0;
4358 : :
4359 : : might_sleep();
4360 : 194424 : sb = ar->inode->i_sb;
4361 : : sbi = EXT4_SB(sb);
4362 : :
4363 : : trace_ext4_request_blocks(ar);
4364 : :
4365 : : /* Allow to use superuser reservation for quota file */
4366 [ - + ]: 388865 : if (IS_NOQUOTA(ar->inode))
4367 : 0 : ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4368 : :
4369 : : /*
4370 : : * For delayed allocation, we could skip the ENOSPC and
4371 : : * EDQUOT check, as blocks and quota have already been
4372 : : * reserved when the data was copied into the pagecache.
4373 : : */
4374 [ + + ]: 194441 : if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4375 : 86503 : ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4376 : : else {
4377 : : /* Without delayed allocation we need to verify
4378 : : * there are enough free blocks to do block allocation
4379 : : * and verify allocation doesn't exceed the quota limits.
4380 : : */
4381 [ + - + ]: 216007 : while (ar->len &&
4382 : 107997 : ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4383 : :
4384 : : /* let others to free the space */
4385 : 0 : cond_resched();
4386 : 0 : ar->len = ar->len >> 1;
4387 : : }
4388 [ - + ]: 108010 : if (!ar->len) {
4389 : 0 : *errp = -ENOSPC;
4390 : 0 : return 0;
4391 : : }
4392 : : reserv_clstrs = ar->len;
4393 [ + - ]: 108010 : if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4394 : 0 : dquot_alloc_block_nofail(ar->inode,
4395 : 0 : EXT4_C2B(sbi, ar->len));
4396 : : } else {
4397 [ + ][ - + ]: 216088 : while (ar->len &&
4398 : 216048 : dquot_alloc_block(ar->inode,
4399 : 108024 : EXT4_C2B(sbi, ar->len))) {
4400 : :
4401 : 0 : ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4402 : 0 : ar->len--;
4403 : : }
4404 : : }
4405 : 108078 : inquota = ar->len;
4406 [ - + ]: 108078 : if (ar->len == 0) {
4407 : 0 : *errp = -EDQUOT;
4408 : 0 : goto out;
4409 : : }
4410 : : }
4411 : :
4412 : 194581 : ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4413 [ - + ]: 194577 : if (!ac) {
4414 : 0 : ar->len = 0;
4415 : 0 : *errp = -ENOMEM;
4416 : 0 : goto out;
4417 : : }
4418 : :
4419 : 194577 : *errp = ext4_mb_initialize_context(ac, ar);
4420 [ - + ]: 194526 : if (*errp) {
4421 : 0 : ar->len = 0;
4422 : 0 : goto out;
4423 : : }
4424 : :
4425 : 194526 : ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4426 [ + + ]: 194526 : if (!ext4_mb_use_preallocated(ac)) {
4427 : 66252 : ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4428 : 66252 : ext4_mb_normalize_request(ac, ar);
4429 : : repeat:
4430 : : /* allocate space in core */
4431 : 66267 : *errp = ext4_mb_regular_allocator(ac);
4432 [ + ]: 66323 : if (*errp)
4433 : : goto discard_and_exit;
4434 : :
4435 : : /* as we've just preallocated more space than the
4436 : : * user originally requested, we store the allocated
4437 : : * space in a special descriptor */
4438 [ + + ][ + + ]: 66326 : if (ac->ac_status == AC_STATUS_FOUND &&
4439 : 66323 : ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4440 : 22519 : *errp = ext4_mb_new_preallocation(ac);
4441 [ + + ]: 66323 : if (*errp) {
4442 : : discard_and_exit:
4443 : : ext4_discard_allocated_blocks(ac);
4444 : : goto errout;
4445 : : }
4446 : : }
4447 [ + - ]: 194576 : if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4448 : 194576 : *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4449 [ - + ]: 194575 : if (*errp == -EAGAIN) {
4450 : : /*
4451 : : * drop the reference that we took
4452 : : * in ext4_mb_use_best_found
4453 : : */
4454 : 0 : ext4_mb_release_context(ac);
4455 : 0 : ac->ac_b_ex.fe_group = 0;
4456 : 0 : ac->ac_b_ex.fe_start = 0;
4457 : 0 : ac->ac_b_ex.fe_len = 0;
4458 : 0 : ac->ac_status = AC_STATUS_CONTINUE;
4459 : 0 : goto repeat;
4460 [ - + ]: 194575 : } else if (*errp) {
4461 : : ext4_discard_allocated_blocks(ac);
4462 : : goto errout;
4463 : : } else {
4464 : 194575 : block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4465 : 194575 : ar->len = ac->ac_b_ex.fe_len;
4466 : : }
4467 : : } else {
4468 : 0 : freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4469 [ # # ]: 0 : if (freed)
4470 : : goto repeat;
4471 : 0 : *errp = -ENOSPC;
4472 : : }
4473 : :
4474 : : errout:
4475 [ - + ]: 194603 : if (*errp) {
4476 : 0 : ac->ac_b_ex.fe_len = 0;
4477 : 0 : ar->len = 0;
4478 : : ext4_mb_show_ac(ac);
4479 : : }
4480 : 194603 : ext4_mb_release_context(ac);
4481 : : out:
4482 [ + ]: 194523 : if (ac)
4483 : 194558 : kmem_cache_free(ext4_ac_cachep, ac);
4484 [ + + ][ - + ]: 194547 : if (inquota && ar->len < inquota)
4485 : 0 : dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4486 [ - + ]: 194547 : if (!ar->len) {
4487 [ # # ]: 0 : if (!ext4_test_inode_state(ar->inode,
4488 : : EXT4_STATE_DELALLOC_RESERVED))
4489 : : /* release all the reserved blocks if non delalloc */
4490 : 0 : percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4491 : : reserv_clstrs);
4492 : : }
4493 : :
4494 : : trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4495 : :
4496 : 194547 : return block;
4497 : : }
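
The overall flow of ext4_mb_new_blocks() is: serve from preallocated space if possible; otherwise normalize the request and run the regular allocator, and on ENOSPC discard other groups' preallocations and retry. A minimal control-flow sketch with stub predicates standing in for the real steps (all three stub functions are hypothetical):

    #include <stdio.h>

    static int use_preallocated(void)       { return 0; }          /* no PA hit */
    static int regular_allocator(int pass)  { return pass > 0; }   /* ok on retry */
    static int discard_preallocations(void) { return 1; }          /* freed space */

    int main(void)
    {
            if (use_preallocated()) {
                    puts("served from a preallocation");
                    return 0;
            }
            /* ...normalize the request here, then: */
            for (int pass = 0; ; pass++) {
                    if (regular_allocator(pass)) {
                            puts("allocated from the buddy");
                            return 0;
                    }
                    if (!discard_preallocations()) {
                            puts("ENOSPC");
                            return 1;
                    }
                    /* some preallocated space was freed: repeat */
            }
    }
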
4498 : :
4499 : : /*
4500 : : * We can merge two free data extents only if the physical blocks
4501 : : * are contiguous, AND the extents were freed by the same transaction,
4502 : : * AND the blocks are associated with the same group.
4503 : : */
4504 : : static int can_merge(struct ext4_free_data *entry1,
4505 : : struct ext4_free_data *entry2)
4506 : : {
4507 [ + + ][ + ]: 179040 : if ((entry1->efd_tid == entry2->efd_tid) &&
[ + + ][ + - ]
4508 [ + + ][ + + ]: 179040 : (entry1->efd_group == entry2->efd_group) &&
4509 : 176904 : ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4510 : : return 1;
4511 : : return 0;
4512 : : }
4513 : :
4514 : : static noinline_for_stack int
4515 : 0 : ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4516 : : struct ext4_free_data *new_entry)
4517 : : {
4518 : 110070 : ext4_group_t group = e4b->bd_group;
4519 : : ext4_grpblk_t cluster;
4520 : : struct ext4_free_data *entry;
4521 : 110070 : struct ext4_group_info *db = e4b->bd_info;
4522 : 110070 : struct super_block *sb = e4b->bd_sb;
4523 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
4524 : 110070 : struct rb_node **n = &db->bb_free_root.rb_node, *node;
4525 : : struct rb_node *parent = NULL, *new_node;
4526 : :
4527 [ - + ]: 110070 : BUG_ON(!ext4_handle_valid(handle));
4528 [ - + ]: 110070 : BUG_ON(e4b->bd_bitmap_page == NULL);
4529 [ - + ]: 110070 : BUG_ON(e4b->bd_buddy_page == NULL);
4530 : :
4531 : 110070 : new_node = &new_entry->efd_node;
4532 : 110070 : cluster = new_entry->efd_start_cluster;
4533 : :
4534 [ + + ]: 110070 : if (!*n) {
4535 : : /* first free block extent. We need to
4536 : : * protect the buddy cache from being freed;
4537 : : * otherwise we'd refresh it from the
4538 : : * on-disk bitmap and lose not-yet-available
4539 : : * blocks */
4540 : : page_cache_get(e4b->bd_buddy_page);
4541 : 3260 : page_cache_get(e4b->bd_bitmap_page);
4542 : : }
4543 [ + + ]: 606258 : while (*n) {
4544 : : parent = *n;
4545 : : entry = rb_entry(parent, struct ext4_free_data, efd_node);
4546 [ + + ]: 496187 : if (cluster < entry->efd_start_cluster)
4547 : 318097 : n = &(*n)->rb_left;
4548 [ + ]: 178090 : else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4549 : 178130 : n = &(*n)->rb_right;
4550 : : else {
4551 : 0 : ext4_grp_locked_error(sb, group, 0,
4552 : : ext4_group_first_block_no(sb, group) +
4553 : : EXT4_C2B(sbi, cluster),
4554 : : "Block already on to-be-freed list");
4555 : 496227 : return 0;
4556 : : }
4557 : : }
4558 : :
4559 : : rb_link_node(new_node, parent, n);
4560 : 110071 : rb_insert_color(new_node, &db->bb_free_root);
4561 : :
4562 : : /* Now see whether the new extent can be merged with its left and right neighbors */
4563 : 110065 : node = rb_prev(new_node);
4564 [ + + ]: 110071 : if (node) {
4565 : 85934 : entry = rb_entry(node, struct ext4_free_data, efd_node);
4566 [ + + + - ]: 122309 : if (can_merge(entry, new_entry) &&
4567 : : ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4568 : 36375 : new_entry->efd_start_cluster = entry->efd_start_cluster;
4569 : 36375 : new_entry->efd_count += entry->efd_count;
4570 : 36375 : rb_erase(node, &(db->bb_free_root));
4571 : 36375 : kmem_cache_free(ext4_free_data_cachep, entry);
4572 : : }
4573 : : }
4574 : :
4575 : 110071 : node = rb_next(new_node);
4576 [ + + ]: 110076 : if (node) {
4577 : 93106 : entry = rb_entry(node, struct ext4_free_data, efd_node);
4578 [ + + + - ]: 98711 : if (can_merge(new_entry, entry) &&
4579 : : ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4580 : 5605 : new_entry->efd_count += entry->efd_count;
4581 : 5605 : rb_erase(node, &(db->bb_free_root));
4582 : 5605 : kmem_cache_free(ext4_free_data_cachep, entry);
4583 : : }
4584 : : }
4585 : : /* Add the extent to transaction's private list */
4586 : : ext4_journal_callback_add(handle, ext4_free_data_callback,
4587 : : &new_entry->efd_jce);
4588 : 110083 : return 0;
4589 : : }
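
ext4_mb_free_metadata() keeps the per-group freed extents in an rb-tree ordered by start cluster and coalesces the new entry with its in-order neighbors when can_merge() allows it. A compact user-space model of that insert-and-coalesce step, using a sorted array in place of the rb-tree (all names here are hypothetical):

#include <assert.h>
#include <string.h>

struct ext { unsigned long tid; unsigned int start, count; };

static struct ext tree[16];
static int nents;

/* Insert keeping the array ordered by start, then merge with neighbors. */
static void insert_extent(struct ext e)
{
	int i = 0;

	while (i < nents && tree[i].start < e.start)
		i++;
	memmove(&tree[i + 1], &tree[i], (nents - i) * sizeof(e));
	tree[i] = e;
	nents++;

	/* Merge with the right neighbor, as in can_merge(new, next). */
	if (i + 1 < nents && tree[i].tid == tree[i + 1].tid &&
	    tree[i].start + tree[i].count == tree[i + 1].start) {
		tree[i].count += tree[i + 1].count;
		memmove(&tree[i + 1], &tree[i + 2],
			(nents - i - 2) * sizeof(e));
		nents--;
	}
	/* Merge with the left neighbor, as in can_merge(prev, new). */
	if (i > 0 && tree[i - 1].tid == tree[i].tid &&
	    tree[i - 1].start + tree[i - 1].count == tree[i].start) {
		tree[i - 1].count += tree[i].count;
		memmove(&tree[i], &tree[i + 1], (nents - i - 1) * sizeof(e));
		nents--;
	}
}

int main(void)
{
	insert_extent((struct ext){ .tid = 7, .start = 120, .count = 10 });
	insert_extent((struct ext){ .tid = 7, .start = 100, .count = 20 });
	assert(nents == 1 && tree[0].start == 100 && tree[0].count == 30);
	return 0;
}

As in the kernel path, a neighbor is absorbed only when it was freed in the same transaction and is physically adjacent.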
4590 : :
4591 : : /**
4592 : : * ext4_free_blocks() -- Free given blocks and update quota
4593 : : * @handle: handle for this transaction
4594 : : * @inode: inode
: : * @bh: optional buffer of the block to be freed
4595 : : * @block: start physical block to free
4596 : : * @count: number of blocks to free
4597 : : * @flags: flags used by ext4_free_blocks
4598 : : */
4599 : 0 : void ext4_free_blocks(handle_t *handle, struct inode *inode,
4600 : : struct buffer_head *bh, ext4_fsblk_t block,
4601 : : unsigned long count, int flags)
4602 : : {
4603 : : struct buffer_head *bitmap_bh = NULL;
4604 : 440237 : struct super_block *sb = inode->i_sb;
4605 : : struct ext4_group_desc *gdp;
4606 : : unsigned int overflow;
4607 : : ext4_grpblk_t bit;
4608 : : struct buffer_head *gd_bh;
4609 : : ext4_group_t block_group;
4610 : : struct ext4_sb_info *sbi;
4611 : : struct ext4_inode_info *ei = EXT4_I(inode);
4612 : : struct ext4_buddy e4b;
4613 : : unsigned int count_clusters;
4614 : : int err = 0;
4615 : : int ret;
4616 : :
4617 : : might_sleep();
4618 [ - + ]: 110062 : if (bh) {
4619 [ # # ]: 0 : if (block)
4620 [ # # ]: 0 : BUG_ON(block != bh->b_blocknr);
4621 : : else
4622 : 0 : block = bh->b_blocknr;
4623 : : }
4624 : :
4625 : : sbi = EXT4_SB(sb);
4626 [ + - + ]: 220129 : if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4627 : 110067 : !ext4_data_block_valid(sbi, block, count)) {
4628 : 0 : ext4_error(sb, "Freeing blocks not in datazone - "
4629 : : "block = %llu, count = %lu", block, count);
4630 : 0 : goto error_return;
4631 : : }
4632 : :
4633 : : ext4_debug("freeing block %llu\n", block);
4634 : : trace_ext4_free_blocks(inode, block, count, flags);
4635 : :
4636 [ + + ]: 220118 : if (flags & EXT4_FREE_BLOCKS_FORGET) {
4637 : : struct buffer_head *tbh = bh;
4638 : : int i;
4639 : :
4640 [ + - ]: 38248 : BUG_ON(bh && (count > 1));
4641 : :
4642 [ + + ]: 76531 : for (i = 0; i < count; i++) {
4643 : 38281 : cond_resched();
4644 [ + ]: 38280 : if (!bh)
4645 : 38283 : tbh = sb_find_get_block(inode->i_sb,
4646 : : block + i);
4647 [ + + ]: 38277 : if (!tbh)
4648 : 2421 : continue;
4649 : 35856 : ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4650 : : inode, tbh, block + i);
4651 : : }
4652 : : }
4653 : :
4654 : : /*
4655 : : * We need to make sure we don't reuse the freed block until
4656 : : * after the transaction is committed, which we can do by
4657 : : * treating the block as metadata, below. We make an
4658 : : * exception if the inode is to be written in writeback mode
4659 : : * since writeback mode has weak data consistency guarantees.
4660 : : */
4661 [ + + ]: 110058 : if (!ext4_should_writeback_data(inode))
4662 : 110055 : flags |= EXT4_FREE_BLOCKS_METADATA;
4663 : :
4664 : : /*
4665 : : * If the extent to be freed does not begin on a cluster
4666 : : * boundary, we need to deal with partial clusters at the
4667 : : * beginning and end of the extent. Normally we will free
4668 : : * blocks at the beginning or the end unless we are explicitly
4669 : : * requested to avoid doing so.
4670 : : */
4671 : 110058 : overflow = EXT4_PBLK_COFF(sbi, block);
4672 [ - + ]: 110058 : if (overflow) {
4673 [ # # ]: 0 : if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4674 : 0 : overflow = sbi->s_cluster_ratio - overflow;
4675 : 0 : block += overflow;
4676 [ # # ]: 0 : if (count > overflow)
4677 : 0 : count -= overflow;
4678 : : else
4679 : : return;
4680 : : } else {
4681 : 0 : block -= overflow;
4682 : 0 : count += overflow;
4683 : : }
4684 : : }
4685 : 110058 : overflow = EXT4_LBLK_COFF(sbi, count);
4686 [ - + ]: 110058 : if (overflow) {
4687 [ # # ]: 0 : if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4688 [ # # ]: 0 : if (count > overflow)
4689 : 0 : count -= overflow;
4690 : : else
4691 : : return;
4692 : : } else
4693 : 110058 : count += sbi->s_cluster_ratio - overflow;
4694 : : }
4695 : :
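/*
 * Worked example of the alignment above, with illustrative numbers: on a
 * bigalloc filesystem with s_cluster_ratio = 16, freeing block = 1003,
 * count = 37 gives EXT4_PBLK_COFF = 1003 % 16 = 11, so (absent the
 * NOFREE_FIRST_CLUSTER flag) the range is widened to block = 992,
 * count = 48. 48 % 16 == 0, so no tail adjustment is needed and exactly
 * three whole clusters are freed.
 */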
4696 : : do_more:
4697 : : overflow = 0;
4698 : 110061 : ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4699 : :
4700 [ + + ]: 110069 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4701 : : ext4_get_group_info(sb, block_group))))
4702 : : return;
4703 : :
4704 : : /*
4705 : : * Check to see if we are freeing blocks across a group
4706 : : * boundary.
4707 : : */
4708 [ + + ]: 110063 : if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4709 : 3 : overflow = EXT4_C2B(sbi, bit) + count -
4710 : : EXT4_BLOCKS_PER_GROUP(sb);
4711 : 3 : count -= overflow;
4712 : : }
4713 : 110063 : count_clusters = EXT4_NUM_B2C(sbi, count);
4714 : 110063 : bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4715 [ + ]: 110051 : if (!bitmap_bh) {
4716 : : err = -EIO;
4717 : : goto error_return;
4718 : : }
4719 : 110054 : gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4720 [ + ]: 110046 : if (!gdp) {
4721 : : err = -EIO;
4722 : : goto error_return;
4723 : : }
4724 : :
4725 [ - + ]: 220074 : if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
[ # # - + ]
4726 [ # # + ]: 220090 : in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4727 [ + + ]: 220071 : in_range(block, ext4_inode_table(sb, gdp),
4728 [ + ]: 110040 : EXT4_SB(sb)->s_itb_per_group) ||
4729 [ + ]: 220090 : in_range(block + count - 1, ext4_inode_table(sb, gdp),
4730 : : EXT4_SB(sb)->s_itb_per_group)) {
4731 : :
4732 : 0 : ext4_error(sb, "Freeing blocks in system zone - "
4733 : : "Block = %llu, count = %lu", block, count);
4734 : : /* err = 0. ext4_std_error should be a no-op */
4735 : 0 : goto error_return;
4736 : : }
4737 : :
4738 : : BUFFER_TRACE(bitmap_bh, "getting write access");
4739 : 110066 : err = ext4_journal_get_write_access(handle, bitmap_bh);
4740 [ + ]: 110036 : if (err)
4741 : : goto error_return;
4742 : :
4743 : : /*
4744 : : * We are about to modify some metadata. Call the journal APIs
4745 : : * to unshare ->b_data if a currently-committing transaction is
4746 : : * using it
4747 : : */
4748 : : BUFFER_TRACE(gd_bh, "get_write_access");
4749 : 110037 : err = ext4_journal_get_write_access(handle, gd_bh);
4750 [ + + ]: 110082 : if (err)
4751 : : goto error_return;
4752 : : #ifdef AGGRESSIVE_CHECK
4753 : : {
4754 : : int i;
4755 : : for (i = 0; i < count_clusters; i++)
4756 : : BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4757 : : }
4758 : : #endif
4759 : 110032 : trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4760 : :
4761 : 110032 : err = ext4_mb_load_buddy(sb, block_group, &e4b);
4762 [ + ]: 110049 : if (err)
4763 : : goto error_return;
4764 : :
4765 [ + + ][ + ]: 330194 : if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4766 : : struct ext4_free_data *new_entry;
4767 : : /*
4768 : : * blocks being freed are metadata. these blocks shouldn't
4769 : : * be used until this transaction is committed
4770 : : */
4771 : : retry:
4772 : 110057 : new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4773 [ - + ]: 110070 : if (!new_entry) {
4774 : : /*
4775 : : * We use a retry loop because
4776 : : * ext4_free_blocks() is not allowed to fail.
4777 : : */
4778 : 0 : cond_resched();
4779 : 0 : congestion_wait(BLK_RW_ASYNC, HZ/50);
4780 : 0 : goto retry;
4781 : : }
4782 : 110070 : new_entry->efd_start_cluster = bit;
4783 : 110070 : new_entry->efd_group = block_group;
4784 : 110070 : new_entry->efd_count = count_clusters;
4785 : 110070 : new_entry->efd_tid = handle->h_transaction->t_tid;
4786 : :
4787 : : ext4_lock_group(sb, block_group);
4788 : 110056 : mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4789 : 110071 : ext4_mb_free_metadata(handle, &e4b, new_entry);
4790 : : } else {
4791 : : /* need to update group_info->bb_free and the bitmap
4792 : : * with the group lock held; ext4_mb_generate_buddy()
4793 : : * looks at them with the group lock held
4794 : : */
4795 [ # # ]: 0 : if (test_opt(sb, DISCARD)) {
4796 : 0 : err = ext4_issue_discard(sb, block_group, bit, count);
4797 [ # # ]: 0 : if (err && err != -EOPNOTSUPP)
4798 : 0 : ext4_msg(sb, KERN_WARNING, "discard request in"
4799 : : " group:%d block:%d count:%lu failed"
4800 : : " with %d", block_group, bit, count,
4801 : : err);
4802 : : } else
4803 : 0 : EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4804 : :
4805 : 0 : ext4_lock_group(sb, block_group);
4806 : 0 : mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4807 : 0 : mb_free_blocks(inode, &e4b, bit, count_clusters);
4808 : : }
4809 : :
4810 : 110083 : ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4811 : 110083 : ext4_free_group_clusters_set(sb, gdp, ret);
4812 : 110081 : ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4813 : 110080 : ext4_group_desc_csum_set(sb, block_group, gdp);
4814 : 110068 : ext4_unlock_group(sb, block_group);
4815 : :
4816 [ + + ]: 110077 : if (sbi->s_log_groups_per_flex) {
4817 : 110072 : ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4818 : 220144 : atomic64_add(count_clusters,
4819 : 110072 : &sbi->s_flex_groups[flex_group].free_clusters);
4820 : : }
4821 : :
4822 [ + + ][ + - ]: 110068 : if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
4823 : 191 : percpu_counter_add(&sbi->s_dirtyclusters_counter,
4824 : : count_clusters);
4825 : : spin_lock(&ei->i_block_reservation_lock);
4826 [ + - ]: 191 : if (flags & EXT4_FREE_BLOCKS_METADATA)
4827 : 191 : ei->i_reserved_meta_blocks += count_clusters;
4828 : : else
4829 : 0 : ei->i_reserved_data_blocks += count_clusters;
4830 : : spin_unlock(&ei->i_block_reservation_lock);
4831 [ + - ]: 191 : if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4832 : 191 : dquot_reclaim_block(inode,
4833 : 191 : EXT4_C2B(sbi, count_clusters));
4834 [ + - ]: 109877 : } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4835 : 109877 : dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4836 : 110005 : percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4837 : :
4838 : 110075 : ext4_mb_unload_buddy(&e4b);
4839 : :
4840 : : /* We dirtied the bitmap block */
4841 : : BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4842 : 110023 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4843 : :
4844 : : /* And the group descriptor block */
4845 : : BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4846 : 110049 : ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4847 [ + ]: 110079 : if (!err)
4848 : : err = ret;
4849 : :
4850 [ + + ]: 110079 : if (overflow && !err) {
4851 : : block += count;
4852 : : count = overflow;
4853 : : put_bh(bitmap_bh);
4854 : : goto do_more;
4855 : : }
4856 : : error_return:
4857 : : brelse(bitmap_bh);
4858 [ - + ]: 110055 : ext4_std_error(sb, err);
4859 : : return;
4860 : : }
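
ext4_free_blocks() is not allowed to fail, which is why the metadata path above loops on kmem_cache_alloc() until it succeeds, yielding between attempts. A user-space sketch of the same must-not-fail allocation pattern, with nanosleep() standing in for cond_resched()/congestion_wait():

#include <stdlib.h>
#include <time.h>

/* Retry until the allocation succeeds; the caller cannot report failure. */
static void *alloc_must_succeed(size_t size)
{
	void *p;

	while (!(p = malloc(size))) {
		/* Back off briefly before retrying, as the kernel does. */
		struct timespec ts = { 0, 20 * 1000 * 1000 }; /* ~20 ms */
		nanosleep(&ts, NULL);
	}
	return p;
}

int main(void)
{
	free(alloc_must_succeed(64));
	return 0;
}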
4861 : :
4862 : : /**
4863 : : * ext4_group_add_blocks() -- Add given blocks to an existing group
4864 : : * @handle: handle to this transaction
4865 : : * @sb: super block
4866 : : * @block: start physical block to add to the block group
4867 : : * @count: number of blocks to add
4868 : : *
4869 : : * This marks the blocks as free in the bitmap and buddy.
4870 : : */
4871 : 0 : int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4872 : : ext4_fsblk_t block, unsigned long count)
4873 : : {
4874 : : struct buffer_head *bitmap_bh = NULL;
4875 : : struct buffer_head *gd_bh;
4876 : : ext4_group_t block_group;
4877 : : ext4_grpblk_t bit;
4878 : : unsigned int i;
4879 : : struct ext4_group_desc *desc;
4880 : : struct ext4_sb_info *sbi = EXT4_SB(sb);
4881 : : struct ext4_buddy e4b;
4882 : : int err = 0, ret, blk_free_count;
4883 : : ext4_grpblk_t blocks_freed;
4884 : :
4885 : : ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4886 : :
4887 [ # # ]: 0 : if (count == 0)
4888 : : return 0;
4889 : :
4890 : 0 : ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4891 : : /*
4892 : : * Check to see if we are adding blocks across a group
4893 : : * boundary.
4894 : : */
4895 [ # # ]: 0 : if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4896 : 0 : ext4_warning(sb, "too much blocks added to group %u\n",
4897 : : block_group);
4898 : : err = -EINVAL;
4899 : 0 : goto error_return;
4900 : : }
4901 : :
4902 : 0 : bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4903 [ # # ]: 0 : if (!bitmap_bh) {
4904 : : err = -EIO;
4905 : : goto error_return;
4906 : : }
4907 : :
4908 : 0 : desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4909 [ # # ]: 0 : if (!desc) {
4910 : : err = -EIO;
4911 : : goto error_return;
4912 : : }
4913 : :
4914 [ # # ]: 0 : if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
[ # # # # ]
4915 [ # # # # ]: 0 : in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4916 [ # # # # ]: 0 : in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4917 [ # # ]: 0 : in_range(block + count - 1, ext4_inode_table(sb, desc),
4918 : : sbi->s_itb_per_group)) {
4919 : 0 : ext4_error(sb, "Adding blocks in system zones - "
4920 : : "Block = %llu, count = %lu",
4921 : : block, count);
4922 : : err = -EINVAL;
4923 : 0 : goto error_return;
4924 : : }
4925 : :
4926 : : BUFFER_TRACE(bitmap_bh, "getting write access");
4927 : 0 : err = ext4_journal_get_write_access(handle, bitmap_bh);
4928 [ # # ]: 0 : if (err)
4929 : : goto error_return;
4930 : :
4931 : : /*
4932 : : * We are about to modify some metadata. Call the journal APIs
4933 : : * to unshare ->b_data if a currently-committing transaction is
4934 : : * using it
4935 : : */
4936 : : BUFFER_TRACE(gd_bh, "get_write_access");
4937 : 0 : err = ext4_journal_get_write_access(handle, gd_bh);
4938 [ # # ]: 0 : if (err)
4939 : : goto error_return;
4940 : :
4941 [ # # ]: 0 : for (i = 0, blocks_freed = 0; i < count; i++) {
4942 : : BUFFER_TRACE(bitmap_bh, "clear bit");
4943 [ # # ]: 0 : if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4944 : 0 : ext4_error(sb, "bit already cleared for block %llu",
4945 : : (ext4_fsblk_t)(block + i));
4946 : : BUFFER_TRACE(bitmap_bh, "bit already cleared");
4947 : : } else {
4948 : 0 : blocks_freed++;
4949 : : }
4950 : : }
4951 : :
4952 : 0 : err = ext4_mb_load_buddy(sb, block_group, &e4b);
4953 [ # # ]: 0 : if (err)
4954 : : goto error_return;
4955 : :
4956 : : /*
4957 : : * need to update group_info->bb_free and the bitmap
4958 : : * with the group lock held; ext4_mb_generate_buddy()
4959 : : * looks at them with the group lock held
4960 : : */
4961 : 0 : ext4_lock_group(sb, block_group);
4962 : 0 : mb_clear_bits(bitmap_bh->b_data, bit, count);
4963 : 0 : mb_free_blocks(NULL, &e4b, bit, count);
4964 : 0 : blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4965 : 0 : ext4_free_group_clusters_set(sb, desc, blk_free_count);
4966 : 0 : ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
4967 : 0 : ext4_group_desc_csum_set(sb, block_group, desc);
4968 : 0 : ext4_unlock_group(sb, block_group);
4969 : 0 : percpu_counter_add(&sbi->s_freeclusters_counter,
4970 : 0 : EXT4_NUM_B2C(sbi, blocks_freed));
4971 : :
4972 [ # # ]: 0 : if (sbi->s_log_groups_per_flex) {
4973 : 0 : ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4974 : 0 : atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
4975 : 0 : &sbi->s_flex_groups[flex_group].free_clusters);
4976 : : }
4977 : :
4978 : 0 : ext4_mb_unload_buddy(&e4b);
4979 : :
4980 : : /* We dirtied the bitmap block */
4981 : : BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4982 : 0 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4983 : :
4984 : : /* And the group descriptor block */
4985 : : BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4986 : 0 : ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4987 [ # # ]: 0 : if (!err)
4988 : : err = ret;
4989 : :
4990 : : error_return:
4991 : : brelse(bitmap_bh);
4992 [ # # ]: 0 : ext4_std_error(sb, err);
4993 : 0 : return err;
4994 : : }
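
Both ext4_free_blocks() and ext4_group_add_blocks() reject ranges that overlap filesystem metadata (block bitmap, inode bitmap, inode table) with in_range(). A self-contained sketch of that overlap test; the macro below mirrors the inclusive start/length form used by jbd2, and the numbers are hypothetical:

#include <assert.h>

#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
	unsigned long long inode_table = 1024; /* hypothetical start block */
	unsigned long itb_per_group = 512;     /* hypothetical table length */

	/* A freed range must not touch any block of the inode table. */
	assert(in_range(1024, inode_table, itb_per_group));  /* first block */
	assert(in_range(1535, inode_table, itb_per_group));  /* last block  */
	assert(!in_range(1536, inode_table, itb_per_group)); /* one past    */
	return 0;
}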
4995 : :
4996 : : /**
4997 : : * ext4_trim_extent -- function to TRIM one single free extent in the group
4998 : : * @sb: super block for the file system
4999 : : * @start: starting block of the free extent in the alloc. group
5000 : : * @count: number of blocks to TRIM
5001 : : * @group: alloc. group we are working with
5002 : : * @e4b: ext4 buddy for the group
5003 : : *
5004 : : * Trim "count" blocks starting at "start" in the "group". To assure that no
5005 : : * one will allocate those blocks, mark it as used in buddy bitmap. This must
5006 : : * be called with under the group lock.
5007 : : */
5008 : 0 : static int ext4_trim_extent(struct super_block *sb, int start, int count,
5009 : : ext4_group_t group, struct ext4_buddy *e4b)
5010 : : {
5011 : : struct ext4_free_extent ex;
5012 : : int ret = 0;
5013 : :
5014 : : trace_ext4_trim_extent(sb, group, start, count);
5015 : :
5016 [ # # ]: 0 : assert_spin_locked(ext4_group_lock_ptr(sb, group));
5017 : :
5018 : 0 : ex.fe_start = start;
5019 : 0 : ex.fe_group = group;
5020 : 0 : ex.fe_len = count;
5021 : :
5022 : : /*
5023 : : * Mark blocks used, so no one can reuse them while
5024 : : * being trimmed.
5025 : : */
5026 : 0 : mb_mark_used(e4b, &ex);
5027 : : ext4_unlock_group(sb, group);
5028 : : ret = ext4_issue_discard(sb, group, start, count);
5029 : : ext4_lock_group(sb, group);
5030 : 0 : mb_free_blocks(NULL, e4b, start, ex.fe_len);
5031 : 0 : return ret;
5032 : : }
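
Note how ext4_trim_extent() first reserves the extent in the buddy bitmap so it can drop the group lock for the duration of the potentially slow discard, then re-takes the lock and frees the extent back. A pthread sketch of that reserve/unlock/IO/relock/release shape (a model of the pattern only, not the kernel's locking primitives); compile with -pthread:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static int reserved; /* stands in for mb_mark_used() on the extent */

static void slow_discard(void)
{
	/* Placeholder for ext4_issue_discard(); may sleep in real life. */
	printf("discarding while the group lock is dropped\n");
}

static void trim_extent_model(void)
{
	pthread_mutex_lock(&group_lock);
	reserved = 1;                  /* no one can allocate the extent */
	pthread_mutex_unlock(&group_lock);

	slow_discard();                /* I/O runs without the lock held */

	pthread_mutex_lock(&group_lock);
	reserved = 0;                  /* the mb_free_blocks() equivalent */
	pthread_mutex_unlock(&group_lock);
}

int main(void)
{
	trim_extent_model();
	return 0;
}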
5033 : :
5034 : : /**
5035 : : * ext4_trim_all_free -- function to trim all free space in alloc. group
5036 : : * @sb: super block for file system
5037 : : * @group: group to be trimmed
5038 : : * @start: first group block to examine
5039 : : * @max: last group block to examine
5040 : : * @minblocks: minimum extent block count
5041 : : *
5042 : : * ext4_trim_all_free walks through the group's buddy bitmap searching for
5043 : : * free extents. When a free extent is found, it is marked as used in the
5044 : : * group buddy bitmap (via ext4_trim_extent), a TRIM command is issued on
5045 : : * the extent, and the extent is then freed again in the buddy bitmap. This
5046 : : * is done until the whole group is scanned.
5051 : : */
5052 : : static ext4_grpblk_t
5053 : 0 : ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5054 : : ext4_grpblk_t start, ext4_grpblk_t max,
5055 : : ext4_grpblk_t minblocks)
5056 : : {
5057 : : void *bitmap;
5058 : : ext4_grpblk_t next, count = 0, free_count = 0;
5059 : : struct ext4_buddy e4b;
5060 : : int ret = 0;
5061 : :
5062 : : trace_ext4_trim_all_free(sb, group, start, max);
5063 : :
5064 : 0 : ret = ext4_mb_load_buddy(sb, group, &e4b);
5065 [ # # ]: 0 : if (ret) {
5066 : 0 : ext4_error(sb, "Error in loading buddy "
5067 : : "information for %u", group);
5068 : 0 : return ret;
5069 : : }
5070 : 0 : bitmap = e4b.bd_bitmap;
5071 : :
5072 : : ext4_lock_group(sb, group);
5073 [ # # ][ # # ]: 0 : if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5074 : 0 : minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5075 : : goto out;
5076 : :
5077 : 0 : start = (e4b.bd_info->bb_first_free > start) ?
5078 : 0 : e4b.bd_info->bb_first_free : start;
5079 : :
5080 [ # # ]: 0 : while (start <= max) {
5081 : 0 : start = mb_find_next_zero_bit(bitmap, max + 1, start);
5082 [ # # ]: 0 : if (start > max)
5083 : : break;
5084 : : next = mb_find_next_bit(bitmap, max + 1, start);
5085 : :
5086 [ # # ]: 0 : if ((next - start) >= minblocks) {
5087 : 0 : ret = ext4_trim_extent(sb, start,
5088 : : next - start, group, &e4b);
5089 [ # # ]: 0 : if (ret && ret != -EOPNOTSUPP)
5090 : : break;
5091 : : ret = 0;
5092 : 0 : count += next - start;
5093 : : }
5094 : 0 : free_count += next - start;
5095 : 0 : start = next + 1;
5096 : :
5097 [ # # ]: 0 : if (fatal_signal_pending(current)) {
5098 : : count = -ERESTARTSYS;
5099 : : break;
5100 : : }
5101 : :
5102 [ # # ]: 0 : if (need_resched()) {
5103 : : ext4_unlock_group(sb, group);
5104 : 0 : cond_resched();
5105 : : ext4_lock_group(sb, group);
5106 : : }
5107 : :
5108 [ # # ]: 0 : if ((e4b.bd_info->bb_free - free_count) < minblocks)
5109 : : break;
5110 : : }
5111 : :
5112 [ # # ]: 0 : if (!ret) {
5113 : : ret = count;
5114 : 0 : EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5115 : : }
5116 : : out:
5117 : : ext4_unlock_group(sb, group);
5118 : 0 : ext4_mb_unload_buddy(&e4b);
5119 : :
5120 : : ext4_debug("trimmed %d blocks in the group %d\n",
5121 : : count, group);
5122 : :
5123 : 0 : return ret;
5124 : : }
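
The scan above alternates mb_find_next_zero_bit()/mb_find_next_bit() to carve the bitmap into runs of free clusters and trims only runs of at least minblocks. A runnable user-space model over a byte-per-bit bitmap (the helpers are simplified stand-ins; the kernel operates on packed bitmaps):

#include <stdio.h>

/* 0 = free, 1 = used; one byte per bit keeps the model simple. */
static const char bitmap[] = { 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1 };
#define NBITS ((int)sizeof(bitmap))

static int next_zero_bit(int from) { while (from < NBITS && bitmap[from]) from++; return from; }
static int next_set_bit(int from)  { while (from < NBITS && !bitmap[from]) from++; return from; }

int main(void)
{
	int minblocks = 3, start = 0;

	while (start < NBITS) {
		start = next_zero_bit(start);
		if (start >= NBITS)
			break;
		int next = next_set_bit(start);
		if (next - start >= minblocks)
			printf("trim extent [%d, %d)\n", start, next);
		start = next + 1;
	}
	return 0;
}

With minblocks = 3 this prints the extents [1, 4) and [9, 13) and skips the two-cluster run at bit 6, mirroring the "(next - start) >= minblocks" filter above.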
5125 : :
5126 : : /**
5127 : : * ext4_trim_fs() -- trim ioctl handler function
5128 : : * @sb: superblock for filesystem
5129 : : * @range: fstrim_range structure
5130 : : *
5131 : : * start: first byte to trim
5132 : : * len: number of bytes to trim from start
5133 : : * minlen: minimum extent length in bytes
5134 : : * ext4_trim_fs goes through all allocation groups containing bytes from
5135 : : * start to start+len. For each such group the ext4_trim_all_free function
5136 : : * is invoked to trim all free space.
5137 : : */
5138 : 0 : int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5139 : : {
5140 : : struct ext4_group_info *grp;
5141 : : ext4_group_t group, first_group, last_group;
5142 : : ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5143 : : uint64_t start, end, minlen, trimmed = 0;
5144 : 0 : ext4_fsblk_t first_data_blk =
5145 : 0 : le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5146 : 0 : ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5147 : : int ret = 0;
5148 : :
5149 : 0 : start = range->start >> sb->s_blocksize_bits;
5150 : 0 : end = start + (range->len >> sb->s_blocksize_bits) - 1;
5151 : 0 : minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5152 : : range->minlen >> sb->s_blocksize_bits);
5153 : :
5154 [ # # ][ # # ]: 0 : if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5155 [ # # ]: 0 : start >= max_blks ||
5156 : 0 : range->len < sb->s_blocksize)
5157 : : return -EINVAL;
5158 [ # # ]: 0 : if (end >= max_blks)
5159 : 0 : end = max_blks - 1;
5160 [ # # ]: 0 : if (end <= first_data_blk)
5161 : : goto out;
5162 [ # # ]: 0 : if (start < first_data_blk)
5163 : : start = first_data_blk;
5164 : :
5165 : : /* Determine first and last group to examine based on start and end */
5166 : 0 : ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5167 : : &first_group, &first_cluster);
5168 : 0 : ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5169 : : &last_group, &last_cluster);
5170 : :
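/*
 * Illustrative arithmetic for the two calls above (assuming 32768 blocks
 * per group and a 1:1 block-to-cluster ratio): start block 100000 lands
 * in group 100000 / 32768 = 3 at offset 100000 - 3 * 32768 = 1696, so
 * first_group is 3 and first_cluster is 1696.
 */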
5171 : : /* end now represents the last cluster to discard in this group */
5172 : 0 : end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5173 : :
5174 [ # # ]: 0 : for (group = first_group; group <= last_group; group++) {
5175 : : grp = ext4_get_group_info(sb, group);
5176 : : /* We only do this if the grp has never been initialized */
5177 [ # # ]: 0 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5178 : 0 : ret = ext4_mb_init_group(sb, group);
5179 [ # # ]: 0 : if (ret)
5180 : : break;
5181 : : }
5182 : :
5183 : : /*
5184 : : * For all the groups except the last one, last cluster will
5185 : : * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5186 : : * change it for the last group, note that last_cluster is
5187 : : * already computed earlier by ext4_get_group_no_and_offset()
5188 : : */
5189 [ # # ]: 0 : if (group == last_group)
5190 : 0 : end = last_cluster;
5191 : :
5192 [ # # ]: 0 : if (grp->bb_free >= minlen) {
5193 : 0 : cnt = ext4_trim_all_free(sb, group, first_cluster,
5194 : : end, minlen);
5195 [ # # ]: 0 : if (cnt < 0) {
5196 : : ret = cnt;
5197 : : break;
5198 : : }
5199 : 0 : trimmed += cnt;
5200 : : }
5201 : :
5202 : : /*
5203 : : * For every group except the first one, we are sure
5204 : : * that the first cluster to discard will be cluster #0.
5205 : : */
5206 : 0 : first_cluster = 0;
5207 : : }
5208 : :
5209 [ # # ]: 0 : if (!ret)
5210 : 0 : atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5211 : :
5212 : : out:
5213 : 0 : range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5214 : 0 : return ret;
5215 : : }
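
For reference, this handler is reached from user space through the FITRIM ioctl. A minimal caller (the mount-point path is a placeholder); on return the kernel updates range.len with the number of bytes trimmed, as set at the out: label above:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = ~0ULL, /* whole filesystem */
		.minlen = 0,     /* kernel rounds this up as needed */
	};
	int fd = open("/mnt", O_RDONLY); /* any path on the ext4 fs */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}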
|