LCOV - coverage.info - mm/vmscan.c

LCOV - code coverage report

Current view:	top level - mm - vmscan.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	532	778	68.4 %
Date:	2014-04-16	Functions:	47	56	83.9 %
		Branches:	402	732	54.9 %

           Branch data     Line data    Source code

       1                 :            : /*
       2                 :            :  *  linux/mm/vmscan.c
       3                 :            :  *
       4                 :            :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       5                 :            :  *
       6                 :            :  *  Swap reorganised 29.12.95, Stephen Tweedie.
       7                 :            :  *  kswapd added: 7.1.96  sct
       8                 :            :  *  Removed kswapd_ctl limits, and swap out as many pages as needed
       9                 :            :  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
      10                 :            :  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
      11                 :            :  *  Multiqueue VM started 5.8.00, Rik van Riel.
      12                 :            :  */
      13                 :            : 
      14                 :            : #include <linux/mm.h>
      15                 :            : #include <linux/module.h>
      16                 :            : #include <linux/gfp.h>
      17                 :            : #include <linux/kernel_stat.h>
      18                 :            : #include <linux/swap.h>
      19                 :            : #include <linux/pagemap.h>
      20                 :            : #include <linux/init.h>
      21                 :            : #include <linux/highmem.h>
      22                 :            : #include <linux/vmpressure.h>
      23                 :            : #include <linux/vmstat.h>
      24                 :            : #include <linux/file.h>
      25                 :            : #include <linux/writeback.h>
      26                 :            : #include <linux/blkdev.h>
      27                 :            : #include <linux/buffer_head.h>    /* for try_to_release_page(),
      28                 :            :                                         buffer_heads_over_limit */
      29                 :            : #include <linux/mm_inline.h>
      30                 :            : #include <linux/backing-dev.h>
      31                 :            : #include <linux/rmap.h>
      32                 :            : #include <linux/topology.h>
      33                 :            : #include <linux/cpu.h>
      34                 :            : #include <linux/cpuset.h>
      35                 :            : #include <linux/compaction.h>
      36                 :            : #include <linux/notifier.h>
      37                 :            : #include <linux/rwsem.h>
      38                 :            : #include <linux/delay.h>
      39                 :            : #include <linux/kthread.h>
      40                 :            : #include <linux/freezer.h>
      41                 :            : #include <linux/memcontrol.h>
      42                 :            : #include <linux/delayacct.h>
      43                 :            : #include <linux/sysctl.h>
      44                 :            : #include <linux/oom.h>
      45                 :            : #include <linux/prefetch.h>
      46                 :            : #include <linux/debugfs.h>
      47                 :            : 
      48                 :            : #include <asm/tlbflush.h>
      49                 :            : #include <asm/div64.h>
      50                 :            : 
      51                 :            : #include <linux/swapops.h>
      52                 :            : #include <linux/balloon_compaction.h>
      53                 :            : 
      54                 :            : #include "internal.h"
      55                 :            : 
      56                 :            : #define CREATE_TRACE_POINTS
      57                 :            : #include <trace/events/vmscan.h>
      58                 :            : 
      59                 :            : struct scan_control {
      60                 :            :         /* Incremented by the number of inactive pages that were scanned */
      61                 :            :         unsigned long nr_scanned;
      62                 :            : 
      63                 :            :         /* Number of pages freed so far during a call to shrink_zones() */
      64                 :            :         unsigned long nr_reclaimed;
      65                 :            : 
      66                 :            :         /* How many pages shrink_list() should reclaim */
      67                 :            :         unsigned long nr_to_reclaim;
      68                 :            : 
      69                 :            :         unsigned long hibernation_mode;
      70                 :            : 
      71                 :            :         /* This context's GFP mask */
      72                 :            :         gfp_t gfp_mask;
      73                 :            : 
      74                 :            :         int may_writepage;
      75                 :            : 
      76                 :            :         /* Can mapped pages be reclaimed? */
      77                 :            :         int may_unmap;
      78                 :            : 
      79                 :            :         /* Can pages be swapped as part of reclaim? */
      80                 :            :         int may_swap;
      81                 :            : 
      82                 :            :         int order;
      83                 :            : 
      84                 :            :         /* Scan (total_size >> priority) pages at once */
      85                 :            :         int priority;
      86                 :            : 
      87                 :            :         /*
      88                 :            :          * The memory cgroup that hit its limit and as a result is the
      89                 :            :          * primary target of this reclaim invocation.
      90                 :            :          */
      91                 :            :         struct mem_cgroup *target_mem_cgroup;
      92                 :            : 
      93                 :            :         /*
      94                 :            :          * Nodemask of nodes allowed by the caller. If NULL, all nodes
      95                 :            :          * are scanned.
      96                 :            :          */
      97                 :            :         nodemask_t      *nodemask;
      98                 :            : };
      99                 :            : 
     100                 :            : #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
     101                 :            : 
     102                 :            : #ifdef ARCH_HAS_PREFETCH
     103                 :            : #define prefetch_prev_lru_page(_page, _base, _field)                    \
     104                 :            :         do {                                                            \
     105                 :            :                 if ((_page)->lru.prev != _base) {                    \
     106                 :            :                         struct page *prev;                              \
     107                 :            :                                                                         \
     108                 :            :                         prev = lru_to_page(&(_page->lru));               \
     109                 :            :                         prefetch(&prev->_field);                 \
     110                 :            :                 }                                                       \
     111                 :            :         } while (0)
     112                 :            : #else
     113                 :            : #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
     114                 :            : #endif
     115                 :            : 
     116                 :            : #ifdef ARCH_HAS_PREFETCHW
     117                 :            : #define prefetchw_prev_lru_page(_page, _base, _field)                   \
     118                 :            :         do {                                                            \
     119                 :            :                 if ((_page)->lru.prev != _base) {                    \
     120                 :            :                         struct page *prev;                              \
     121                 :            :                                                                         \
     122                 :            :                         prev = lru_to_page(&(_page->lru));               \
     123                 :            :                         prefetchw(&prev->_field);                        \
     124                 :            :                 }                                                       \
     125                 :            :         } while (0)
     126                 :            : #else
     127                 :            : #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
     128                 :            : #endif
     129                 :            : 
     130                 :            : /*
     131                 :            :  * From 0 .. 100.  Higher means more swappy.
     132                 :            :  */
     133                 :            : int vm_swappiness = 60;
     134                 :            : unsigned long vm_total_pages;   /* The total number of pages which the VM controls */
     135                 :            : 
     136                 :            : static LIST_HEAD(shrinker_list);
     137                 :            : static DECLARE_RWSEM(shrinker_rwsem);
     138                 :            : 
     139                 :            : #ifdef CONFIG_MEMCG
     140                 :            : static bool global_reclaim(struct scan_control *sc)
     141                 :            : {
     142                 :            :         return !sc->target_mem_cgroup;
     143                 :            : }
     144                 :            : #else
     145                 :            : static bool global_reclaim(struct scan_control *sc)
     146                 :            : {
     147                 :            :         return true;
     148                 :            : }
     149                 :            : #endif
     150                 :            : 
     151                 :          0 : static unsigned long zone_reclaimable_pages(struct zone *zone)
     152                 :            : {
     153                 :            :         int nr;
     154                 :            : 
     155                 :     611951 :         nr = zone_page_state(zone, NR_ACTIVE_FILE) +
     156                 :            :              zone_page_state(zone, NR_INACTIVE_FILE);
     157                 :            : 
     158         [ +  - ]:     611951 :         if (get_nr_swap_pages() > 0)
     159                 :          0 :                 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
     160                 :            :                       zone_page_state(zone, NR_INACTIVE_ANON);
     161                 :            : 
     162                 :          0 :         return nr;
     163                 :            : }
     164                 :            : 
     165                 :          0 : bool zone_reclaimable(struct zone *zone)
     166                 :            : {
     167                 :     398149 :         return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
     168                 :            : }
     169                 :            : 
     170                 :            : static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
     171                 :            : {
     172                 :            :         if (!mem_cgroup_disabled())
     173                 :            :                 return mem_cgroup_get_lru_size(lruvec, lru);
     174                 :            : 
     175                 :     644896 :         return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
     176                 :            : }
     177                 :            : 
     178                 :            : struct dentry *debug_file;
     179                 :            : 
     180                 :          0 : static int debug_shrinker_show(struct seq_file *s, void *unused)
     181                 :            : {
     182                 :            :         struct shrinker *shrinker;
     183                 :            :         struct shrink_control sc;
     184                 :            : 
     185                 :          0 :         sc.gfp_mask = -1;
     186                 :          0 :         sc.nr_to_scan = 0;
     187                 :            : 
     188                 :          0 :         down_read(&shrinker_rwsem);
     189         [ #  # ]:          0 :         list_for_each_entry(shrinker, &shrinker_list, list) {
     190                 :            :                 int num_objs;
     191                 :            : 
     192                 :          0 :                 num_objs = shrinker->count_objects(shrinker, &sc);
     193                 :          0 :                 seq_printf(s, "%pf %d\n", shrinker->scan_objects, num_objs);
     194                 :            :         }
     195                 :          0 :         up_read(&shrinker_rwsem);
     196                 :          0 :         return 0;
     197                 :            : }
     198                 :            : 
     199                 :          0 : static int debug_shrinker_open(struct inode *inode, struct file *file)
     200                 :            : {
     201                 :          0 :         return single_open(file, debug_shrinker_show, inode->i_private);
     202                 :            : }
     203                 :            : 
     204                 :            : static const struct file_operations debug_shrinker_fops = {
     205                 :            :         .open = debug_shrinker_open,
     206                 :            :         .read = seq_read,
     207                 :            :         .llseek = seq_lseek,
     208                 :            :         .release = single_release,
     209                 :            : };
     210                 :            : 
     211                 :            : /*
     212                 :            :  * Add a shrinker callback to be called from the vm.
     213                 :            :  */
     214                 :          0 : int register_shrinker(struct shrinker *shrinker)
     215                 :            : {
     216                 :            :         size_t size = sizeof(*shrinker->nr_deferred);
     217                 :            : 
     218                 :            :         /*
     219                 :            :          * If we only have one possible node in the system anyway, save
     220                 :            :          * ourselves the trouble and disable NUMA aware behavior. This way we
     221                 :            :          * will save memory and some small loop time later.
     222                 :            :          */
     223                 :            :         if (nr_node_ids == 1)
     224                 :         28 :                 shrinker->flags &= ~SHRINKER_NUMA_AWARE;
     225                 :            : 
     226                 :            :         if (shrinker->flags & SHRINKER_NUMA_AWARE)
     227                 :            :                 size *= nr_node_ids;
     228                 :            : 
     229                 :         28 :         shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
     230         [ +  - ]:         28 :         if (!shrinker->nr_deferred)
     231                 :            :                 return -ENOMEM;
     232                 :            : 
     233                 :         28 :         down_write(&shrinker_rwsem);
     234                 :         28 :         list_add_tail(&shrinker->list, &shrinker_list);
     235                 :         28 :         up_write(&shrinker_rwsem);
     236                 :         28 :         return 0;
     237                 :            : }
     238                 :            : EXPORT_SYMBOL(register_shrinker);
     239                 :            : 
     240                 :          0 : static int __init add_shrinker_debug(void)
     241                 :            : {
     242                 :          0 :         debugfs_create_file("shrinker", 0644, NULL, NULL,
     243                 :            :                             &debug_shrinker_fops);
     244                 :          0 :         return 0;
     245                 :            : }
     246                 :            : 
     247                 :            : late_initcall(add_shrinker_debug);
     248                 :            : 
     249                 :            : /*
     250                 :            :  * Remove a previously registered shrinker callback.
     251                 :            :  */
     252                 :          0 : void unregister_shrinker(struct shrinker *shrinker)
     253                 :            : {
     254                 :         28 :         down_write(&shrinker_rwsem);
     255                 :            :         list_del(&shrinker->list);
     256                 :         28 :         up_write(&shrinker_rwsem);
     257                 :         28 :         kfree(shrinker->nr_deferred);
     258                 :         28 : }
     259                 :            : EXPORT_SYMBOL(unregister_shrinker);
     260                 :            : 
     261                 :            : #define SHRINK_BATCH 128
     262                 :            : 
     263                 :            : static unsigned long
     264                 :          0 : shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
     265                 :            :                  unsigned long nr_pages_scanned, unsigned long lru_pages)
     266                 :            : {
     267                 :            :         unsigned long freed = 0;
     268                 :            :         unsigned long long delta;
     269                 :            :         long total_scan;
     270                 :            :         long max_pass;
     271                 :            :         long nr;
     272                 :            :         long new_nr;
     273                 :    3258365 :         int nid = shrinkctl->nid;
     274                 :    3258365 :         long batch_size = shrinker->batch ? shrinker->batch
     275         [ +  + ]:    3258365 :                                           : SHRINK_BATCH;
     276                 :            : 
     277                 :    3258365 :         max_pass = shrinker->count_objects(shrinker, shrinkctl);
     278         [ +  + ]:    3257069 :         if (max_pass == 0)
     279                 :            :                 return 0;
     280                 :            : 
     281                 :            :         /*
     282                 :            :          * copy the current shrinker scan count into a local variable
     283                 :            :          * and zero it so that other concurrent shrinker invocations
     284                 :            :          * don't also do this scanning work.
     285                 :            :          */
     286                 :    3560713 :         nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
     287                 :            : 
     288                 :            :         total_scan = nr;
     289                 :    3409842 :         delta = (4 * nr_pages_scanned) / shrinker->seeks;
     290                 :    3409842 :         delta *= max_pass;
     291 [ -  + ][ #  # ]:    3409842 :         do_div(delta, lru_pages + 1);
         [ -  + ][ -  + ]
         [ -  + ][ -  + ]
         [ -  + ][ -  + ]
         [ -  + ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     292                 :     151477 :         total_scan += delta;
     293         [ -  + ]:     151477 :         if (total_scan < 0) {
     294                 :          0 :                 printk(KERN_ERR
     295                 :            :                 "shrink_slab: %pF negative objects to delete nr=%ld\n",
     296                 :            :                        shrinker->scan_objects, total_scan);
     297                 :            :                 total_scan = max_pass;
     298                 :            :         }
     299                 :            : 
     300                 :            :         /*
     301                 :            :          * We need to avoid excessive windup on filesystem shrinkers
     302                 :            :          * due to large numbers of GFP_NOFS allocations causing the
     303                 :            :          * shrinkers to return -1 all the time. This results in a large
     304                 :            :          * nr being built up so when a shrink that can do some work
     305                 :            :          * comes along it empties the entire cache due to nr >>>
     306                 :            :          * max_pass.  This is bad for sustaining a working set in
     307                 :            :          * memory.
     308                 :            :          *
     309                 :            :          * Hence only allow the shrinker to scan the entire cache when
     310                 :            :          * a large delta change is calculated directly.
     311                 :            :          */
     312         [ +  + ]:     151297 :         if (delta < max_pass / 4)
     313                 :      15715 :                 total_scan = min(total_scan, max_pass / 2);
     314                 :            : 
     315                 :            :         /*
     316                 :            :          * Avoid risking looping forever due to too large nr value:
     317                 :            :          * never try to free more than twice the estimated number of
     318                 :            :          * freeable entries.
     319                 :            :          */
     320         [ +  + ]:     151297 :         if (total_scan > max_pass * 2)
     321                 :            :                 total_scan = max_pass * 2;
     322                 :            : 
     323                 :     151297 :         trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
     324                 :            :                                 nr_pages_scanned, lru_pages,
     325                 :            :                                 max_pass, delta, total_scan);
     326                 :            : 
     327                 :            :         /*
     328                 :            :          * Normally, we should not scan less than batch_size objects in one
     329                 :            :          * pass to avoid too frequent shrinker calls, but if the slab has less
     330                 :            :          * than batch_size objects in total and we are really tight on memory,
     331                 :            :          * we will try to reclaim all available objects, otherwise we can end
     332                 :            :          * up failing allocations although there are plenty of reclaimable
     333                 :            :          * objects spread over several slabs with usage less than the
     334                 :            :          * batch_size.
     335                 :            :          *
     336                 :            :          * We detect the "tight on memory" situations by looking at the total
     337                 :            :          * number of objects we want to scan (total_scan). If it is greater
     338                 :            :          * than the total number of objects on slab (max_pass), we must be
     339                 :            :          * scanning at high prio and therefore should try to reclaim as much as
     340                 :            :          * possible.
     341                 :            :          */
     342         [ +  + ]:     278661 :         while (total_scan >= batch_size ||
     343                 :     278661 :                total_scan >= max_pass) {
     344                 :            :                 unsigned long ret;
     345                 :     126952 :                 unsigned long nr_to_scan = min(batch_size, total_scan);
     346                 :            : 
     347                 :     126952 :                 shrinkctl->nr_to_scan = nr_to_scan;
     348                 :     126952 :                 ret = shrinker->scan_objects(shrinker, shrinkctl);
     349         [ +  - ]:     127239 :                 if (ret == SHRINK_STOP)
     350                 :            :                         break;
     351                 :     127239 :                 freed += ret;
     352                 :            : 
     353                 :            :                 count_vm_events(SLABS_SCANNED, nr_to_scan);
     354                 :     127240 :                 total_scan -= nr_to_scan;
     355                 :            : 
     356                 :     127240 :                 cond_resched();
     357                 :            :         }
     358                 :            : 
     359                 :            :         /*
     360                 :            :          * move the unused scan count back into the shrinker in a
     361                 :            :          * manner that handles concurrent updates. If we exhausted the
     362                 :            :          * scan, there is no need to do an update.
     363                 :            :          */
     364         [ +  + ]:     151709 :         if (total_scan > 0)
     365                 :      97018 :                 new_nr = atomic_long_add_return(total_scan,
     366                 :      97018 :                                                 &shrinker->nr_deferred[nid]);
     367                 :            :         else
     368                 :      54691 :                 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
     369                 :            : 
     370                 :     151707 :         trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
     371                 :     151707 :         return freed;
     372                 :            : }
     373                 :            : 
     374                 :            : /*
     375                 :            :  * Call the shrink functions to age shrinkable caches
     376                 :            :  *
     377                 :            :  * Here we assume it costs one seek to replace a lru page and that it also
     378                 :            :  * takes a seek to recreate a cache object.  With this in mind we age equal
     379                 :            :  * percentages of the lru and ageable caches.  This should balance the seeks
     380                 :            :  * generated by these structures.
     381                 :            :  *
     382                 :            :  * If the vm encountered mapped pages on the LRU it increase the pressure on
     383                 :            :  * slab to avoid swapping.
     384                 :            :  *
     385                 :            :  * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
     386                 :            :  *
     387                 :            :  * `lru_pages' represents the number of on-LRU pages in all the zones which
     388                 :            :  * are eligible for the caller's allocation attempt.  It is used for balancing
     389                 :            :  * slab reclaim versus page reclaim.
     390                 :            :  *
     391                 :            :  * Returns the number of slab objects which we shrunk.
     392                 :            :  */
     393                 :          0 : unsigned long shrink_slab(struct shrink_control *shrinkctl,
     394                 :            :                           unsigned long nr_pages_scanned,
     395                 :            :                           unsigned long lru_pages)
     396                 :            : {
     397                 :            :         struct shrinker *shrinker;
     398                 :            :         unsigned long freed = 0;
     399                 :            : 
     400         [ +  + ]:     125783 :         if (nr_pages_scanned == 0)
     401                 :            :                 nr_pages_scanned = SWAP_CLUSTER_MAX;
     402                 :            : 
     403         [ +  + ]:     125783 :         if (!down_read_trylock(&shrinker_rwsem)) {
     404                 :            :                 /*
     405                 :            :                  * If we would return 0, our callers would understand that we
     406                 :            :                  * have nothing else to shrink and give up trying. By returning
     407                 :            :                  * 1 we keep it going and assume we'll be able to shrink next
     408                 :            :                  * time.
     409                 :            :                  */
     410                 :            :                 freed = 1;
     411                 :            :                 goto out;
     412                 :            :         }
     413                 :            : 
     414         [ +  + ]:    3381836 :         list_for_each_entry(shrinker, &shrinker_list, list) {
     415         [ +  + ]:    3256045 :                 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
     416                 :    3255934 :                         shrinkctl->nid = 0;
     417                 :    3255934 :                         freed += shrink_slab_node(shrinkctl, shrinker,
     418                 :            :                                         nr_pages_scanned, lru_pages);
     419                 :    3258440 :                         continue;
     420                 :            :                 }
     421                 :            : 
     422 [ -  + ][ #  # ]:        111 :                 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
     423         [ #  # ]:          0 :                         if (node_online(shrinkctl->nid))
     424                 :          0 :                                 freed += shrink_slab_node(shrinkctl, shrinker,
     425                 :            :                                                 nr_pages_scanned, lru_pages);
     426                 :            : 
     427                 :            :                 }
     428                 :            :         }
     429                 :     125791 :         up_read(&shrinker_rwsem);
     430                 :            : out:
     431                 :     128297 :         cond_resched();
     432                 :     125790 :         return freed;
     433                 :            : }
     434                 :            : 
     435                 :          4 : static inline int is_page_cache_freeable(struct page *page)
     436                 :            : {
     437                 :            :         /*
     438                 :            :          * A freeable page cache page is referenced only by the caller
     439                 :            :          * that isolated the page, the page cache radix tree and
     440                 :            :          * optional buffer heads at page->private.
     441                 :            :          */
     442                 :          4 :         return page_count(page) - page_has_private(page) == 2;
     443                 :            : }
     444                 :            : 
     445                 :          4 : static int may_write_to_queue(struct backing_dev_info *bdi,
     446                 :            :                               struct scan_control *sc)
     447                 :            : {
     448         [ -  + ]:          4 :         if (current->flags & PF_SWAPWRITE)
     449                 :            :                 return 1;
     450         [ #  # ]:          0 :         if (!bdi_write_congested(bdi))
     451                 :            :                 return 1;
     452         [ #  # ]:          0 :         if (bdi == current->backing_dev_info)
     453                 :            :                 return 1;
     454                 :            :         return 0;
     455                 :            : }
     456                 :            : 
     457                 :            : /*
     458                 :            :  * We detected a synchronous write error writing a page out.  Probably
     459                 :            :  * -ENOSPC.  We need to propagate that into the address_space for a subsequent
     460                 :            :  * fsync(), msync() or close().
     461                 :            :  *
     462                 :            :  * The tricky part is that after writepage we cannot touch the mapping: nothing
     463                 :            :  * prevents it from being freed up.  But we have a ref on the page and once
     464                 :            :  * that page is locked, the mapping is pinned.
     465                 :            :  *
     466                 :            :  * We're allowed to run sleeping lock_page() here because we know the caller has
     467                 :            :  * __GFP_FS.
     468                 :            :  */
     469                 :          0 : static void handle_write_error(struct address_space *mapping,
     470                 :            :                                 struct page *page, int error)
     471                 :            : {
     472                 :            :         lock_page(page);
     473         [ #  # ]:          0 :         if (page_mapping(page) == mapping)
     474                 :            :                 mapping_set_error(mapping, error);
     475                 :          0 :         unlock_page(page);
     476                 :          0 : }
     477                 :            : 
     478                 :            : /* possible outcome of pageout() */
     479                 :            : typedef enum {
     480                 :            :         /* failed to write page out, page is locked */
     481                 :            :         PAGE_KEEP,
     482                 :            :         /* move page to the active list, page is locked */
     483                 :            :         PAGE_ACTIVATE,
     484                 :            :         /* page has been sent to the disk successfully, page is unlocked */
     485                 :            :         PAGE_SUCCESS,
     486                 :            :         /* page is clean and locked */
     487                 :            :         PAGE_CLEAN,
     488                 :            : } pageout_t;
     489                 :            : 
     490                 :            : /*
     491                 :            :  * pageout is called by shrink_page_list() for each dirty page.
     492                 :            :  * Calls ->writepage().
     493                 :            :  */
     494                 :          0 : static pageout_t pageout(struct page *page, struct address_space *mapping,
     495                 :            :                          struct scan_control *sc)
     496                 :            : {
     497                 :            :         /*
     498                 :            :          * If the page is dirty, only perform writeback if that write
     499                 :            :          * will be non-blocking, to prevent this allocation from being
     500                 :            :          * stalled by pagecache activity.  But note that there may be
     501                 :            :          * stalls if we need to run get_block().  We could test
     502                 :            :          * PagePrivate for that.
     503                 :            :          *
     504                 :            :          * If this process is currently in __generic_file_aio_write() against
     505                 :            :          * this page's queue, we can perform writeback even if that
     506                 :            :          * will block.
     507                 :            :          *
     508                 :            :          * If the page is swapcache, write it back even if that would
     509                 :            :          * block, for some throttling. This happens by accident, because
     510                 :            :          * swap_backing_dev_info is bust: it doesn't reflect the
     511                 :            :          * congestion state of the swapdevs.  Easy to fix, if needed.
     512                 :            :          */
     513         [ +  - ]:          4 :         if (!is_page_cache_freeable(page))
     514                 :            :                 return PAGE_KEEP;
     515         [ -  + ]:          4 :         if (!mapping) {
     516                 :            :                 /*
     517                 :            :                  * Some data journaling orphaned pages can have
     518                 :            :                  * page->mapping == NULL while being dirty with clean buffers.
     519                 :            :                  */
     520         [ #  # ]:          0 :                 if (page_has_private(page)) {
     521         [ #  # ]:          0 :                         if (try_to_free_buffers(page)) {
     522                 :            :                                 ClearPageDirty(page);
     523                 :          0 :                                 printk("%s: orphaned page\n", __func__);
     524                 :            :                                 return PAGE_CLEAN;
     525                 :            :                         }
     526                 :            :                 }
     527                 :            :                 return PAGE_KEEP;
     528                 :            :         }
     529         [ +  - ]:          4 :         if (mapping->a_ops->writepage == NULL)
     530                 :            :                 return PAGE_ACTIVATE;
     531         [ +  - ]:          4 :         if (!may_write_to_queue(mapping->backing_dev_info, sc))
     532                 :            :                 return PAGE_KEEP;
     533                 :            : 
     534         [ +  - ]:          4 :         if (clear_page_dirty_for_io(page)) {
     535                 :            :                 int res;
     536                 :          4 :                 struct writeback_control wbc = {
     537                 :            :                         .sync_mode = WB_SYNC_NONE,
     538                 :            :                         .nr_to_write = SWAP_CLUSTER_MAX,
     539                 :            :                         .range_start = 0,
     540                 :            :                         .range_end = LLONG_MAX,
     541                 :            :                         .for_reclaim = 1,
     542                 :            :                 };
     543                 :            : 
     544                 :            :                 SetPageReclaim(page);
     545                 :          4 :                 res = mapping->a_ops->writepage(page, &wbc);
     546         [ -  + ]:          4 :                 if (res < 0)
     547                 :          0 :                         handle_write_error(mapping, page, res);
     548         [ -  + ]:          4 :                 if (res == AOP_WRITEPAGE_ACTIVATE) {
     549                 :            :                         ClearPageReclaim(page);
     550                 :            :                         return PAGE_ACTIVATE;
     551                 :            :                 }
     552                 :            : 
     553         [ +  - ]:          4 :                 if (!PageWriteback(page)) {
     554                 :            :                         /* synchronous write or broken a_ops? */
     555                 :            :                         ClearPageReclaim(page);
     556                 :            :                 }
     557         [ -  + ]:          8 :                 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
     558                 :          4 :                 inc_zone_page_state(page, NR_VMSCAN_WRITE);
     559                 :            :                 return PAGE_SUCCESS;
     560                 :            :         }
     561                 :            : 
     562                 :            :         return PAGE_CLEAN;
     563                 :            : }
     564                 :            : 
     565                 :            : /*
     566                 :            :  * Same as remove_mapping, but if the page is removed from the mapping, it
     567                 :            :  * gets returned with a refcount of 0.
     568                 :            :  */
     569                 :          0 : static int __remove_mapping(struct address_space *mapping, struct page *page)
     570                 :            : {
     571         [ -  + ]:     155827 :         BUG_ON(!PageLocked(page));
     572         [ -  + ]:     155827 :         BUG_ON(mapping != page_mapping(page));
     573                 :            : 
     574                 :            :         spin_lock_irq(&mapping->tree_lock);
     575                 :            :         /*
     576                 :            :          * The non racy check for a busy page.
     577                 :            :          *
     578                 :            :          * Must be careful with the order of the tests. When someone has
     579                 :            :          * a ref to the page, it may be possible that they dirty it then
     580                 :            :          * drop the reference. So if PageDirty is tested before page_count
     581                 :            :          * here, then the following race may occur:
     582                 :            :          *
     583                 :            :          * get_user_pages(&page);
     584                 :            :          * [user mapping goes away]
     585                 :            :          * write_to(page);
     586                 :            :          *                              !PageDirty(page)    [good]
     587                 :            :          * SetPageDirty(page);
     588                 :            :          * put_page(page);
     589                 :            :          *                              !page_count(page)   [good, discard it]
     590                 :            :          *
     591                 :            :          * [oops, our write_to data is lost]
     592                 :            :          *
     593                 :            :          * Reversing the order of the tests ensures such a situation cannot
     594                 :            :          * escape unnoticed. The smp_rmb is needed to ensure the page->flags
     595                 :            :          * load is not satisfied before that of page->_count.
     596                 :            :          *
     597                 :            :          * Note that if SetPageDirty is always performed via set_page_dirty,
     598                 :            :          * and thus under tree_lock, then this ordering is not required.
     599                 :            :          */
     600         [ +  + ]:     155827 :         if (!page_freeze_refs(page, 2))
     601                 :            :                 goto cannot_free;
     602                 :            :         /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
     603         [ -  + ]:     155726 :         if (unlikely(PageDirty(page))) {
     604                 :            :                 page_unfreeze_refs(page, 2);
     605                 :            :                 goto cannot_free;
     606                 :            :         }
     607                 :            : 
     608         [ -  + ]:     155726 :         if (PageSwapCache(page)) {
     609                 :          0 :                 swp_entry_t swap = { .val = page_private(page) };
     610                 :          0 :                 __delete_from_swap_cache(page);
     611                 :            :                 spin_unlock_irq(&mapping->tree_lock);
     612                 :          0 :                 swapcache_free(swap, page);
     613                 :            :         } else {
     614                 :            :                 void (*freepage)(struct page *);
     615                 :            : 
     616                 :     155726 :                 freepage = mapping->a_ops->freepage;
     617                 :            : 
     618                 :     155726 :                 __delete_from_page_cache(page);
     619                 :            :                 spin_unlock_irq(&mapping->tree_lock);
     620                 :            :                 mem_cgroup_uncharge_cache_page(page);
     621                 :            : 
     622         [ -  + ]:     155726 :                 if (freepage != NULL)
     623                 :          0 :                         freepage(page);
     624                 :            :         }
     625                 :            : 
     626                 :            :         return 1;
     627                 :            : 
     628                 :            : cannot_free:
     629                 :            :         spin_unlock_irq(&mapping->tree_lock);
     630                 :        101 :         return 0;
     631                 :            : }
     632                 :            : 
     633                 :            : /*
     634                 :            :  * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
     635                 :            :  * someone else has a ref on the page, abort and return 0.  If it was
     636                 :            :  * successfully detached, return 1.  Assumes the caller has a single ref on
     637                 :            :  * this page.
     638                 :            :  */
     639                 :          0 : int remove_mapping(struct address_space *mapping, struct page *page)
     640                 :            : {
     641         [ +  + ]:      72844 :         if (__remove_mapping(mapping, page)) {
     642                 :            :                 /*
     643                 :            :                  * Unfreezing the refcount with 1 rather than 2 effectively
     644                 :            :                  * drops the pagecache ref for us without requiring another
     645                 :            :                  * atomic operation.
     646                 :            :                  */
     647                 :            :                 page_unfreeze_refs(page, 1);
     648                 :      72843 :                 return 1;
     649                 :            :         }
     650                 :            :         return 0;
     651                 :            : }
     652                 :            : 
     653                 :            : /**
     654                 :            :  * putback_lru_page - put previously isolated page onto appropriate LRU list
     655                 :            :  * @page: page to be put back to appropriate lru list
     656                 :            :  *
     657                 :            :  * Add previously isolated @page to appropriate LRU list.
     658                 :            :  * Page may still be unevictable for other reasons.
     659                 :            :  *
     660                 :            :  * lru_lock must not be held, interrupts must be enabled.
     661                 :            :  */
     662                 :          0 : void putback_lru_page(struct page *page)
     663                 :            : {
     664                 :            :         bool is_unevictable;
     665                 :            :         int was_unevictable = PageUnevictable(page);
     666                 :            : 
     667                 :            :         VM_BUG_ON_PAGE(PageLRU(page), page);
     668                 :            : 
     669                 :            : redo:
     670                 :            :         ClearPageUnevictable(page);
     671                 :            : 
     672         [ +  + ]:       5666 :         if (page_evictable(page)) {
     673                 :            :                 /*
     674                 :            :                  * For evictable pages, we can use the cache.
     675                 :            :                  * In the event of a race, the worst case is that we end up
     676                 :            :                  * with an unevictable page on the [in]active list.
     677                 :            :                  * We know how to handle that.
     678                 :            :                  */
     679                 :            :                 is_unevictable = false;
     680                 :       1884 :                 lru_cache_add(page);
     681                 :            :         } else {
     682                 :            :                 /*
     683                 :            :                  * Put unevictable pages directly on zone's unevictable
     684                 :            :                  * list.
     685                 :            :                  */
     686                 :            :                 is_unevictable = true;
     687                 :       3782 :                 add_page_to_unevictable_list(page);
     688                 :            :                 /*
     689                 :            :                  * When racing with an mlock or AS_UNEVICTABLE clearing
     690                 :            :                  * (page is unlocked) make sure that if the other thread
     691                 :            :                  * does not observe our setting of PG_lru and fails
     692                 :            :                  * isolation/check_move_unevictable_pages,
     693                 :            :                  * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
     694                 :            :                  * the page back to the evictable list.
     695                 :            :                  *
     696                 :            :                  * The other side is TestClearPageMlocked() or shmem_lock().
     697                 :            :                  */
     698                 :       3782 :                 smp_mb();
     699                 :            :         }
     700                 :            : 
     701                 :            :         /*
     702                 :            :          * A page's status can change while we move it among LRU lists. If an
     703                 :            :          * evictable page is on the unevictable list, it will never be freed.
     704                 :            :          * To avoid that, check again after we have added it to the list.
     705                 :            :          */
     706 [ +  + ][ -  + ]:       5666 :         if (is_unevictable && page_evictable(page)) {
     707         [ #  # ]:          0 :                 if (!isolate_lru_page(page)) {
     708                 :          0 :                         put_page(page);
     709                 :          0 :                         goto redo;
     710                 :            :                 }
     711                 :            :                 /* This means someone else dropped this page from the LRU,
     712                 :            :                  * so it will be freed or put back on the LRU again. There is
     713                 :            :                  * nothing to do here.
     714                 :            :                  */
     715                 :            :         }
     716                 :            : 
     717         [ +  + ]:       5666 :         if (was_unevictable && !is_unevictable)
     718                 :            :                 count_vm_event(UNEVICTABLE_PGRESCUED);
     719         [ +  - ]:       3782 :         else if (!was_unevictable && is_unevictable)
     720                 :            :                 count_vm_event(UNEVICTABLE_PGCULLED);
     721                 :            : 
     722                 :       5666 :         put_page(page);         /* drop ref from isolate */
     723                 :       5666 : }
     724                 :            : 
     725                 :            : enum page_references {
     726                 :            :         PAGEREF_RECLAIM,
     727                 :            :         PAGEREF_RECLAIM_CLEAN,
     728                 :            :         PAGEREF_KEEP,
     729                 :            :         PAGEREF_ACTIVATE,
     730                 :            : };
     731                 :            : 
     732                 :     100668 : static enum page_references page_check_references(struct page *page,
     733                 :            :                                                   struct scan_control *sc)
     734                 :            : {
     735                 :            :         int referenced_ptes, referenced_page;
     736                 :            :         unsigned long vm_flags;
     737                 :            : 
     738                 :     100668 :         referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
     739                 :            :                                           &vm_flags);
     740                 :            :         referenced_page = TestClearPageReferenced(page);
     741                 :            : 
     742                 :            :         /*
     743                 :            :          * Mlock lost the isolation race with us.  Let try_to_unmap()
     744                 :            :          * move the page to the unevictable list.
     745                 :            :          */
     746            [ + ]:     100662 :         if (vm_flags & VM_LOCKED)
     747                 :            :                 return PAGEREF_RECLAIM;
     748                 :            : 
     749         [ +  + ]:     100665 :         if (referenced_ptes) {
     750         [ +  - ]:        754 :                 if (PageSwapBacked(page))
     751                 :            :                         return PAGEREF_ACTIVATE;
     752                 :            :                 /*
     753                 :            :                  * All mapped pages start out with page table
     754                 :            :                  * references from the instantiating fault, so we need
     755                 :            :                  * to look twice if a mapped file page is used more
     756                 :            :                  * than once.
     757                 :            :                  *
     758                 :            :                  * Mark it and spare it for another trip around the
     759                 :            :                  * inactive list.  Another page table reference will
     760                 :            :                  * lead to its activation.
     761                 :            :                  *
     762                 :            :                  * Note: the mark is set for activated pages as well
     763                 :            :                  * so that recently deactivated but used pages are
     764                 :            :                  * quickly recovered.
     765                 :            :                  */
     766                 :            :                 SetPageReferenced(page);
     767                 :            : 
     768         [ +  + ]:        754 :                 if (referenced_page || referenced_ptes > 1)
     769                 :            :                         return PAGEREF_ACTIVATE;
     770                 :            : 
     771                 :            :                 /*
     772                 :            :                  * Activate file-backed executable pages after first usage.
     773                 :            :                  */
     774         [ -  + ]:        419 :                 if (vm_flags & VM_EXEC)
     775                 :            :                         return PAGEREF_ACTIVATE;
     776                 :            : 
     777                 :            :                 return PAGEREF_KEEP;
     778                 :            :         }
     779                 :            : 
     780                 :            :         /* Reclaim if clean, defer dirty pages to writeback */
     781 [ +  + ][ -  + ]:      99911 :         if (referenced_page && !PageSwapBacked(page))
     782                 :            :                 return PAGEREF_RECLAIM_CLEAN;
     783                 :            : 
     784                 :            :         return PAGEREF_RECLAIM;
     785                 :            : }
     786                 :            : 
     787                 :            : /* Check if a page is dirty or under writeback */
     788                 :          0 : static void page_check_dirty_writeback(struct page *page,
     789                 :            :                                        bool *dirty, bool *writeback)
     790                 :            : {
     791                 :            :         struct address_space *mapping;
     792                 :            : 
     793                 :            :         /*
     794                 :            :          * Anonymous pages are not handled by flushers and must be written
     795                 :            :          * from reclaim context. Do not stall reclaim based on them
     796                 :            :          */
     797         [ -  + ]:     101827 :         if (!page_is_file_cache(page)) {
     798                 :          0 :                 *dirty = false;
     799                 :          0 :                 *writeback = false;
     800                 :          0 :                 return;
     801                 :            :         }
     802                 :            : 
     803                 :            :         /* By default assume that the page flags are accurate */
     804                 :     101827 :         *dirty = PageDirty(page);
     805                 :     101827 :         *writeback = PageWriteback(page);
     806                 :            : 
     807                 :            :         /* Verify dirty/writeback state if the filesystem supports it */
     808         [ +  + ]:     101827 :         if (!page_has_private(page))
     809                 :            :                 return;
     810                 :            : 
     811                 :      85652 :         mapping = page_mapping(page);
     812 [ +  + ][ +  + ]:      85653 :         if (mapping && mapping->a_ops->is_dirty_writeback)
     813                 :      51736 :                 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
     814                 :            : }
     815                 :            : 
     816                 :            : /*
     817                 :            :  * shrink_page_list() returns the number of reclaimed pages
     818                 :            :  */
     819                 :          0 : static unsigned long shrink_page_list(struct list_head *page_list,
     820                 :            :                                       struct zone *zone,
     821                 :     100667 :                                       struct scan_control *sc,
     822                 :            :                                       enum ttu_flags ttu_flags,
     823                 :            :                                       unsigned long *ret_nr_dirty,
     824                 :            :                                       unsigned long *ret_nr_unqueued_dirty,
     825                 :            :                                       unsigned long *ret_nr_congested,
     826                 :            :                                       unsigned long *ret_nr_writeback,
     827                 :            :                                       unsigned long *ret_nr_immediate,
     828                 :            :                                       bool force_reclaim)
     829                 :            : {
     830                 :       6922 :         LIST_HEAD(ret_pages);
     831                 :       6922 :         LIST_HEAD(free_pages);
     832                 :            :         int pgactivate = 0;
     833                 :            :         unsigned long nr_unqueued_dirty = 0;
     834                 :            :         unsigned long nr_dirty = 0;
     835                 :            :         unsigned long nr_congested = 0;
     836                 :            :         unsigned long nr_reclaimed = 0;
     837                 :            :         unsigned long nr_writeback = 0;
     838                 :            :         unsigned long nr_immediate = 0;
     839                 :            : 
     840                 :       6922 :         cond_resched();
     841                 :            : 
     842                 :            :         mem_cgroup_uncharge_start();
     843         [ +  + ]:     112592 :         while (!list_empty(page_list)) {
     844                 :            :                 struct address_space *mapping;
     845                 :     199802 :                 struct page *page;
     846                 :            :                 int may_enter_fs;
     847                 :            :                 enum page_references references = PAGEREF_RECLAIM_CLEAN;
     848                 :            :                 bool dirty, writeback;
     849                 :            : 
     850                 :     105671 :                 cond_resched();
     851                 :            : 
     852                 :     105674 :                 page = lru_to_page(page_list);
     853                 :            :                 list_del(&page->lru);
     854                 :            : 
     855         [ +  + ]:     105670 :                 if (!trylock_page(page))
     856                 :            :                         goto keep;
     857                 :            : 
     858                 :            :                 VM_BUG_ON_PAGE(PageActive(page), page);
     859                 :            :                 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
     860                 :            : 
     861                 :     101823 :                 sc->nr_scanned++;
     862                 :            : 
     863         [ +  - ]:     101823 :                 if (unlikely(!page_evictable(page)))
     864                 :            :                         goto cull_mlocked;
     865                 :            : 
     866 [ -  + ][ #  # ]:     101828 :                 if (!sc->may_unmap && page_mapped(page))
     867                 :            :                         goto keep_locked;
     868                 :            : 
     869                 :            :                 /* Double the slab pressure for mapped and swapcache pages */
     870 [ +  + ][ -  + ]:     101828 :                 if (page_mapped(page) || PageSwapCache(page))
     871                 :       2273 :                         sc->nr_scanned++;
     872                 :            : 
     873 [ -  + ][ #  # ]:     101828 :                 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
     874         [ #  # ]:          0 :                         (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
     875                 :            : 
     876                 :            :                 /*
     877                 :            :                  * The number of dirty pages determines if a zone is marked
     878                 :            :                  * reclaim_congested which affects wait_iff_congested. kswapd
     879                 :            :                  * will stall and start writing pages if the tail of the LRU
     880                 :            :                  * is all dirty unqueued pages.
     881                 :            :                  */
     882                 :     101828 :                 page_check_dirty_writeback(page, &dirty, &writeback);
     883 [ +  + ][ +  + ]:     101827 :                 if (dirty || writeback)
     884                 :       1180 :                         nr_dirty++;
     885                 :            : 
     886 [ +  + ][ +  - ]:     101827 :                 if (dirty && !writeback)
     887                 :         20 :                         nr_unqueued_dirty++;
     888                 :            : 
     889                 :            :                 /*
     890                 :            :                  * Treat this page as congested if the underlying BDI is or if
     891                 :            :                  * pages are cycling through the LRU so quickly that the
     892                 :            :                  * pages marked for immediate reclaim are making it to the
     893                 :            :                  * end of the LRU a second time.
     894                 :            :                  */
     895                 :     101827 :                 mapping = page_mapping(page);
     896 [ +  + ][ +  - ]:     203655 :                 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
                 [ +  + ]
     897         [ +  + ]:       1160 :                     (writeback && PageReclaim(page)))
     898                 :       1153 :                         nr_congested++;
     899                 :            : 
     900                 :            :                 /*
     901                 :            :                  * If a page at the tail of the LRU is under writeback, there
     902                 :            :                  * are three cases to consider.
     903                 :            :                  *
     904                 :            :                  * 1) If reclaim is encountering an excessive number of pages
     905                 :            :                  *    under writeback and this page is both under writeback and
     906                 :            :                  *    PageReclaim then it indicates that pages are being queued
     907                 :            :                  *    for IO but are being recycled through the LRU before the
     908                 :            :                  *    IO can complete. Waiting on the page itself risks an
     909                 :            :                  *    indefinite stall if it is impossible to writeback the
     910                 :            :                  *    page due to IO error or disconnected storage so instead
     911                 :            :                  *    note that the LRU is being scanned too quickly and the
     912                 :            :                  *    caller can stall after page list has been processed.
     913                 :            :                  *
     914                 :            :                  * 2) Global reclaim encounters a page, memcg encounters a
     915                 :            :                  *    page that is not marked for immediate reclaim or
     916                 :            :                  *    the caller does not have __GFP_IO. In this case mark
     917                 :            :                  *    the page for immediate reclaim and continue scanning.
     918                 :            :                  *
     919                 :            :                  *    __GFP_IO is checked  because a loop driver thread might
     920                 :            :                  *    enter reclaim, and deadlock if it waits on a page for
     921                 :            :                  *    which it is needed to do the write (loop masks off
     922                 :            :                  *    __GFP_IO|__GFP_FS for this reason); but more thought
     923                 :            :                  *    would probably show more reasons.
     924                 :            :                  *
     925                 :            :                  *    Don't require __GFP_FS, since we're not going into the
     926                 :            :                  *    FS, just waiting on its writeback completion. Worryingly,
     927                 :            :                  *    ext4 gfs2 and xfs allocate pages with
     928                 :            :                  *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
     929                 :            :                  *    may_enter_fs here is liable to OOM on them.
     930                 :            :                  *
     931                 :            :                  * 3) memcg encounters a page that is not already marked
     932                 :            :                  *    PageReclaim. memcg does not have any dirty pages
     933                 :            :                  *    throttling so we could easily OOM just because too many
     934                 :            :                  *    pages are in writeback and there is nothing else to
     935                 :            :                  *    reclaim. Wait for the writeback to complete.
     936                 :            :                  */
     937         [ +  + ]:     101828 :                 if (PageWriteback(page)) {
     938                 :            :                         /* Case 1 above */
     939 [ +  + ][ +  + ]:       1160 :                         if (current_is_kswapd() &&
     940         [ +  + ]:        175 :                             PageReclaim(page) &&
     941                 :            :                             zone_is_reclaim_writeback(zone)) {
     942                 :          8 :                                 nr_immediate++;
     943                 :          8 :                                 goto keep_locked;
     944                 :            : 
     945                 :            :                         /* Case 2 above */
     946                 :            :                         } else if (global_reclaim(sc) ||
     947                 :            :                             !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
     948                 :            :                                 /*
     949                 :            :                                  * This is slightly racy - end_page_writeback()
     950                 :            :                                  * might have just cleared PageReclaim, then
     951                 :            :                                  * setting PageReclaim here ends up interpreted
     952                 :            :                                  * as PageReadahead - but that does not matter
     953                 :            :                                  * enough to care.  What we do want is for this
     954                 :            :                                  * page to have PageReclaim set next time memcg
     955                 :            :                                  * reclaim reaches the tests above, so it will
     956                 :            :                                  * then wait_on_page_writeback() to avoid OOM;
     957                 :            :                                  * and it's also appropriate in global reclaim.
     958                 :            :                                  */
     959                 :            :                                 SetPageReclaim(page);
     960                 :       1152 :                                 nr_writeback++;
     961                 :            : 
     962                 :       1152 :                                 goto keep_locked;
     963                 :            : 
     964                 :            :                         /* Case 3 above */
     965                 :            :                         } else {
     966                 :            :                                 wait_on_page_writeback(page);
     967                 :            :                         }
     968                 :            :                 }
     969                 :            : 
     970         [ +  + ]:     100668 :                 if (!force_reclaim)
     971                 :     100667 :                         references = page_check_references(page, sc);
     972                 :            : 
     973      [ +  +  - ]:     100664 :                 switch (references) {
     974                 :            :                 case PAGEREF_ACTIVATE:
     975                 :            :                         goto activate_locked;
     976                 :            :                 case PAGEREF_KEEP:
     977                 :            :                         goto keep_locked;
     978                 :            :                 case PAGEREF_RECLAIM:
     979                 :            :                 case PAGEREF_RECLAIM_CLEAN:
     980                 :            :                         ; /* try to reclaim the page below */
     981                 :            :                 }
     982                 :            : 
     983                 :            :                 /*
     984                 :            :                  * Anonymous process memory has backing store?
     985                 :            :                  * Try to allocate it some swap space here.
     986                 :            :                  */
     987 [ -  + ][ #  # ]:      99910 :                 if (PageAnon(page) && !PageSwapCache(page)) {
     988         [ #  # ]:          0 :                         if (!(sc->gfp_mask & __GFP_IO))
     989                 :            :                                 goto keep_locked;
     990         [ #  # ]:          0 :                         if (!add_to_swap(page, page_list))
     991                 :            :                                 goto activate_locked;
     992                 :            :                         may_enter_fs = 1;
     993                 :            : 
     994                 :            :                         /* Adding to swap updated mapping */
     995                 :          0 :                         mapping = page_mapping(page);
     996                 :            :                 }
     997                 :            : 
     998                 :            :                 /*
     999                 :            :                  * The page is mapped into the page tables of one or more
    1000                 :            :                  * processes. Try to unmap it here.
    1001                 :            :                  */
    1002 [ +  + ][ +  - ]:     106834 :                 if (page_mapped(page) && mapping) {
    1003   [ +  -  -  - ]:       1519 :                         switch (try_to_unmap(page, ttu_flags)) {
    1004                 :            :                         case SWAP_FAIL:
    1005                 :            :                                 goto activate_locked;
    1006                 :            :                         case SWAP_AGAIN:
    1007                 :            :                                 goto keep_locked;
    1008                 :            :                         case SWAP_MLOCK:
    1009                 :            :                                 goto cull_mlocked;
    1010                 :            :                         case SWAP_SUCCESS:
    1011                 :            :                                 ; /* try to free the page below */
    1012                 :            :                         }
    1013                 :            :                 }
    1014                 :            : 
    1015         [ +  + ]:      99912 :                 if (PageDirty(page)) {
    1016                 :            :                         /*
    1017                 :            :                          * Only kswapd can writeback filesystem pages to
    1018                 :            :                          * avoid risk of stack overflow but only writeback
    1019                 :            :                          * if many dirty pages have been encountered.
    1020                 :            :                          */
    1021 [ +  - ][ +  - ]:         20 :                         if (page_is_file_cache(page) &&
    1022         [ +  + ]:         20 :                                         (!current_is_kswapd() ||
    1023                 :            :                                          !zone_is_reclaim_dirty(zone))) {
    1024                 :            :                                 /*
    1025                 :            :                                  * Immediately reclaim when written back.
    1026                 :            :                                  * Similar in principle to deactivate_page()
    1027                 :            :                                  * except we already have the page isolated
    1028                 :            :                                  * and know it's dirty
    1029                 :            :                                  */
    1030                 :         13 :                                 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
    1031                 :            :                                 SetPageReclaim(page);
    1032                 :            : 
    1033                 :            :                                 goto keep_locked;
    1034                 :            :                         }
    1035                 :            : 
    1036         [ +  + ]:          7 :                         if (references == PAGEREF_RECLAIM_CLEAN)
    1037                 :            :                                 goto keep_locked;
    1038         [ +  - ]:          4 :                         if (!may_enter_fs)
    1039                 :            :                                 goto keep_locked;
    1040         [ +  - ]:          4 :                         if (!sc->may_writepage)
    1041                 :            :                                 goto keep_locked;
    1042                 :            : 
    1043                 :            :                         /* Page is dirty, try to write it out here */
    1044   [ +  -  -  - ]:          4 :                         switch (pageout(page, mapping, sc)) {
    1045                 :            :                         case PAGE_KEEP:
    1046                 :            :                                 goto keep_locked;
    1047                 :            :                         case PAGE_ACTIVATE:
    1048                 :            :                                 goto activate_locked;
    1049                 :            :                         case PAGE_SUCCESS:
    1050         [ +  - ]:          4 :                                 if (PageWriteback(page))
    1051                 :            :                                         goto keep;
    1052         [ -  + ]:          4 :                                 if (PageDirty(page))
    1053                 :            :                                         goto keep;
    1054                 :            : 
    1055                 :            :                                 /*
    1056                 :            :                                  * A synchronous write - probably a ramdisk.  Go
    1057                 :            :                                  * ahead and try to reclaim the page.
    1058                 :            :                                  */
    1059         [ #  # ]:          0 :                                 if (!trylock_page(page))
    1060                 :            :                                         goto keep;
    1061 [ #  # ][ #  # ]:          0 :                                 if (PageDirty(page) || PageWriteback(page))
    1062                 :            :                                         goto keep_locked;
    1063                 :          0 :                                 mapping = page_mapping(page);
    1064                 :            :                         case PAGE_CLEAN:
    1065                 :            :                                 ; /* try to free the page below */
    1066                 :            :                         }
    1067                 :            :                 }
    1068                 :            : 
    1069                 :            :                 /*
    1070                 :            :                  * If the page has buffers, try to free the buffer mappings
    1071                 :            :                  * associated with this page. If we succeed we try to free
    1072                 :            :                  * the page as well.
    1073                 :            :                  *
    1074                 :            :                  * We do this even if the page is PageDirty().
    1075                 :            :                  * try_to_release_page() does not perform I/O, but it is
    1076                 :            :                  * possible for a page to have PageDirty set, but it is actually
    1077                 :            :                  * clean (all its buffers are clean).  This happens if the
    1078                 :            :                  * buffers were written out directly, with submit_bh(). ext3
    1079                 :            :                  * will do this, as well as the blockdev mapping.
    1080                 :            :                  * try_to_release_page() will discover that cleanness and will
    1081                 :            :                  * drop the buffers and mark the page clean - it can be freed.
    1082                 :            :                  *
    1083                 :            :                  * Rarely, pages can have buffers and no ->mapping.  These are
    1084                 :            :                  * the pages which were not successfully invalidated in
    1085                 :            :                  * truncate_complete_page().  We try to drop those buffers here
    1086                 :            :                  * and if that worked, and the page is no longer mapped into
    1087                 :            :                  * process address space (page_count == 1) it can be freed.
    1088                 :            :                  * Otherwise, leave the page on the LRU so it is swappable.
    1089                 :            :                  */
    1090         [ +  + ]:      99892 :                 if (page_has_private(page)) {
    1091         [ +  + ]:      84471 :                         if (!try_to_release_page(page, sc->gfp_mask))
    1092                 :            :                                 goto activate_locked;
    1093 [ +  + ][ +  - ]:      67564 :                         if (!mapping && page_count(page) == 1) {
    1094                 :          1 :                                 unlock_page(page);
    1095         [ -  + ]:          1 :                                 if (put_page_testzero(page))
    1096                 :            :                                         goto free_it;
    1097                 :            :                                 else {
    1098                 :            :                                         /*
    1099                 :            :                                          * rare race with speculative reference.
    1100                 :            :                                          * the speculative reference will free
    1101                 :            :                                          * this page shortly, so we may
    1102                 :            :                                          * increment nr_reclaimed here (and
    1103                 :            :                                          * leave it off the LRU).
    1104                 :            :                                          */
    1105                 :          0 :                                         nr_reclaimed++;
    1106                 :     105671 :                                         continue;
    1107                 :            :                                 }
    1108                 :            :                         }
    1109                 :            :                 }
    1110                 :            : 
    1111 [ +  - ][ +  + ]:      82983 :                 if (!mapping || !__remove_mapping(mapping, page))
    1112                 :            :                         goto keep_locked;
    1113                 :            : 
    1114                 :            :                 /*
    1115                 :            :                  * At this point, we have no other references and there is
    1116                 :            :                  * no way to pick any more up (removed from LRU, removed
    1117                 :            :                  * from pagecache). Can use non-atomic bitops now (and
    1118                 :            :                  * we obviously don't have to worry about waking up a process
    1119                 :            :                  * waiting on the page lock, because there are no references).
    1120                 :            :                  */
    1121                 :            :                 __clear_page_locked(page);
    1122                 :            : free_it:
    1123                 :      82884 :                 nr_reclaimed++;
    1124                 :            : 
    1125                 :            :                 /*
    1126                 :            :                  * Is there need to periodically free_page_list? It would
    1127                 :            :                  * appear not as the counts should be low
    1128                 :            :                  */
    1129                 :      82884 :                 list_add(&page->lru, &free_pages);
    1130                 :      82884 :                 continue;
    1131                 :            : 
    1132                 :            : cull_mlocked:
    1133         [ #  # ]:          0 :                 if (PageSwapCache(page))
    1134                 :          0 :                         try_to_free_swap(page);
    1135                 :          0 :                 unlock_page(page);
    1136                 :          0 :                 putback_lru_page(page);
    1137                 :          0 :                 continue;
    1138                 :            : 
    1139                 :            : activate_locked:
    1140                 :            :                 /* Not a candidate for swapping, so reclaim swap space. */
    1141 [ -  + ][ #  # ]:      17664 :                 if (PageSwapCache(page) && vm_swap_full())
    1142                 :          0 :                         try_to_free_swap(page);
    1143                 :            :                 VM_BUG_ON_PAGE(PageActive(page), page);
    1144                 :            :                 SetPageActive(page);
    1145                 :      17664 :                 pgactivate++;
    1146                 :            : keep_locked:
    1147                 :      18940 :                 unlock_page(page);
    1148                 :            : keep:
    1149                 :      22787 :                 list_add(&page->lru, &ret_pages);
    1150                 :            :                 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
    1151                 :            :         }
    1152                 :            : 
    1153                 :       6921 :         free_hot_cold_page_list(&free_pages, 1);
    1154                 :            : 
    1155                 :            :         list_splice(&ret_pages, page_list);
    1156                 :            :         count_vm_events(PGACTIVATE, pgactivate);
    1157                 :            :         mem_cgroup_uncharge_end();
    1158                 :       6921 :         *ret_nr_dirty += nr_dirty;
    1159                 :       6921 :         *ret_nr_congested += nr_congested;
    1160                 :       6921 :         *ret_nr_unqueued_dirty += nr_unqueued_dirty;
    1161                 :       6921 :         *ret_nr_writeback += nr_writeback;
    1162                 :       6921 :         *ret_nr_immediate += nr_immediate;
    1163                 :       6921 :         return nr_reclaimed;
    1164                 :            : }
    1165                 :            : 
    1166                 :          0 : unsigned long reclaim_clean_pages_from_list(struct zone *zone,
    1167                 :            :                                             struct list_head *page_list)
    1168                 :            : {
    1169                 :          0 :         struct scan_control sc = {
    1170                 :            :                 .gfp_mask = GFP_KERNEL,
    1171                 :            :                 .priority = DEF_PRIORITY,
    1172                 :            :                 .may_unmap = 1,
    1173                 :            :         };
    1174                 :            :         unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
    1175                 :            :         struct page *page, *next;
    1176                 :          0 :         LIST_HEAD(clean_pages);
    1177                 :            : 
    1178         [ #  # ]:          0 :         list_for_each_entry_safe(page, next, page_list, lru) {
    1179 [ #  # ][ #  # ]:          0 :                 if (page_is_file_cache(page) && !PageDirty(page) &&
    1180                 :            :                     !isolated_balloon_page(page)) {
    1181                 :            :                         ClearPageActive(page);
    1182                 :            :                         list_move(&page->lru, &clean_pages);
    1183                 :            :                 }
    1184                 :            :         }
    1185                 :            : 
    1186                 :          0 :         ret = shrink_page_list(&clean_pages, zone, &sc,
    1187                 :            :                         TTU_UNMAP|TTU_IGNORE_ACCESS,
    1188                 :            :                         &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
    1189                 :            :         list_splice(&clean_pages, page_list);
    1190                 :          0 :         __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
    1191                 :          0 :         return ret;
    1192                 :            : }
    1193                 :            : 
    1194                 :            : /*
    1195                 :            :  * Attempt to remove the specified page from its LRU.  Only take this page
    1196                 :            :  * if it is of the appropriate PageActive status.  Pages which are being
    1197                 :            :  * freed elsewhere are also ignored.
    1198                 :            :  *
    1199                 :            :  * page:        page to consider
    1200                 :            :  * mode:        one of the LRU isolation modes defined above
    1201                 :            :  *
    1202                 :            :  * returns 0 on success, -ve errno on failure.
    1203                 :            :  */
    1204                 :          0 : int __isolate_lru_page(struct page *page, isolate_mode_t mode)
    1205                 :            : {
    1206                 :            :         int ret = -EINVAL;
    1207                 :            : 
    1208                 :            :         /* Only take pages on the LRU. */
    1209         [ +  - ]:     152290 :         if (!PageLRU(page))
    1210                 :            :                 return ret;
    1211                 :            : 
    1212                 :            :         /* Compaction should not handle unevictable pages but CMA can do so */
    1213 [ -  + ][ #  # ]:     152290 :         if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
    1214                 :            :                 return ret;
    1215                 :            : 
    1216                 :            :         ret = -EBUSY;
    1217                 :            : 
    1218                 :            :         /*
    1219                 :            :          * To minimise LRU disruption, the caller can indicate that it only
    1220                 :            :          * wants to isolate pages it will be able to operate on without
    1221                 :            :          * blocking - clean pages for the most part.
    1222                 :            :          *
    1223                 :            :          * ISOLATE_CLEAN means that only clean pages should be isolated. This
    1224                 :            :          * is used by reclaim when it cannot write to backing storage
    1225                 :            :          *
    1226                 :            :          * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
    1227                 :            :          * that it is possible to migrate without blocking
    1228                 :            :          */
    1229         [ -  + ]:     152290 :         if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
    1230                 :            :                 /* All the caller can do on PageWriteback is block */
    1231         [ #  # ]:          0 :                 if (PageWriteback(page))
    1232                 :            :                         return ret;
    1233                 :            : 
    1234         [ #  # ]:          0 :                 if (PageDirty(page)) {
    1235                 :            :                         struct address_space *mapping;
    1236                 :            : 
    1237                 :            :                         /* ISOLATE_CLEAN means only clean pages */
    1238         [ #  # ]:          0 :                         if (mode & ISOLATE_CLEAN)
    1239                 :            :                                 return ret;
    1240                 :            : 
    1241                 :            :                         /*
    1242                 :            :                          * Only pages without mappings or that have a
    1243                 :            :                          * ->migratepage callback are possible to migrate
    1244                 :            :                          * without blocking
    1245                 :            :                          */
    1246                 :          0 :                         mapping = page_mapping(page);
    1247 [ #  # ][ #  # ]:          0 :                         if (mapping && !mapping->a_ops->migratepage)
    1248                 :            :                                 return ret;
    1249                 :            :                 }
    1250                 :            :         }
    1251                 :            : 
    1252 [ -  + ][ #  # ]:     152290 :         if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
    1253                 :            :                 return ret;
    1254                 :            : 
    1255         [ +  - ]:     152290 :         if (likely(get_page_unless_zero(page))) {
    1256                 :            :                 /*
    1257                 :            :                  * Be careful not to clear PageLRU until after we're
    1258                 :            :                  * sure the page is not being freed elsewhere -- the
    1259                 :            :                  * page release code relies on it.
    1260                 :            :                  */
    1261                 :            :                 ClearPageLRU(page);
    1262                 :            :                 ret = 0;
    1263                 :            :         }
    1264                 :            : 
    1265                 :     152290 :         return ret;
    1266                 :            : }
    1267                 :            : 
    1268                 :            : /*
    1269                 :            :  * zone->lru_lock is heavily contended.  Some of the functions that
    1270                 :            :  * shrink the lists perform better by taking out a batch of pages
    1271                 :            :  * and working on them outside the LRU lock.
    1272                 :            :  *
    1273                 :            :  * For pagecache intensive workloads, this function is the hottest
    1274                 :            :  * spot in the kernel (apart from copy_*_user functions).
    1275                 :            :  *
    1276                 :            :  * Appropriate locks must be held before calling this function.
    1277                 :            :  *
    1278                 :            :  * @nr_to_scan: The number of pages to look through on the list.
    1279                 :            :  * @lruvec:     The LRU vector to pull pages from.
    1280                 :            :  * @dst:        The temp list to put pages on to.
    1281                 :            :  * @nr_scanned: The number of pages that were scanned.
    1282                 :            :  * @sc:         The scan_control struct for this reclaim session
    1283                 :            :  * @mode:       One of the LRU isolation modes
    1284                 :            :  * @lru:        LRU list id for isolating
    1285                 :            :  *
    1286                 :            :  * returns how many pages were moved onto *@dst.
    1287                 :            :  */
    1288                 :          0 : static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
    1289                 :            :                 struct lruvec *lruvec, struct list_head *dst,
    1290                 :            :                 unsigned long *nr_scanned, struct scan_control *sc,
    1291                 :            :                 isolate_mode_t mode, enum lru_list lru)
    1292                 :            : {
    1293                 :      52613 :         struct list_head *src = &lruvec->lists[lru];
    1294                 :            :         unsigned long nr_taken = 0;
    1295                 :            :         unsigned long scan;
    1296                 :            : 
    1297 [ +  + ][ +  + ]:     204903 :         for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
    1298                 :            :                 struct page *page;
    1299                 :            :                 int nr_pages;
    1300                 :            : 
    1301                 :     152290 :                 page = lru_to_page(src);
    1302         [ +  + ]:     152290 :                 prefetchw_prev_lru_page(page, src, flags);
    1303                 :            : 
    1304                 :            :                 VM_BUG_ON_PAGE(!PageLRU(page), page);
    1305                 :            : 
    1306      [ +  -  - ]:     152290 :                 switch (__isolate_lru_page(page, mode)) {
    1307                 :            :                 case 0:
    1308                 :            :                         nr_pages = hpage_nr_pages(page);
    1309                 :            :                         mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
    1310                 :     152290 :                         list_move(&page->lru, dst);
    1311                 :     152290 :                         nr_taken += nr_pages;
    1312                 :            :                         break;
    1313                 :            : 
    1314                 :            :                 case -EBUSY:
    1315                 :            :                         /* else it is being freed elsewhere */
    1316                 :          0 :                         list_move(&page->lru, src);
    1317                 :          0 :                         continue;
    1318                 :            : 
    1319                 :            :                 default:
    1320                 :          0 :                         BUG();
    1321                 :            :                 }
    1322                 :            :         }
    1323                 :            : 
    1324                 :      52613 :         *nr_scanned = scan;
    1325                 :      52613 :         trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
    1326                 :            :                                     nr_taken, mode, is_file_lru(lru));
    1327                 :          0 :         return nr_taken;
    1328                 :            : }
    1329                 :            : 
    1330                 :            : /**
    1331                 :            :  * isolate_lru_page - tries to isolate a page from its LRU list
    1332                 :            :  * @page: page to isolate from its LRU list
    1333                 :            :  *
    1334                 :            :  * Isolates a @page from an LRU list, clears PageLRU and adjusts the
    1335                 :            :  * vmstat statistic corresponding to whatever LRU list the page was on.
    1336                 :            :  *
    1337                 :            :  * Returns 0 if the page was removed from an LRU list.
    1338                 :            :  * Returns -EBUSY if the page was not on an LRU list.
    1339                 :            :  *
    1340                 :            :  * The returned page will have PageLRU() cleared.  If it was found on
    1341                 :            :  * the active list, it will have PageActive set.  If it was found on
    1342                 :            :  * the unevictable list, it will have the PageUnevictable bit set. That flag
    1343                 :            :  * may need to be cleared by the caller before letting the page go.
    1344                 :            :  *
    1345                 :            :  * The vmstat statistic corresponding to the list on which the page was
    1346                 :            :  * found will be decremented.
    1347                 :            :  *
    1348                 :            :  * Restrictions:
    1349                 :            :  * (1) Must be called with an elevated refcount on the page. This is a
    1350                 :            :  *     fundamental difference from isolate_lru_pages (which is called
    1351                 :            :  *     without a stable reference).
    1352                 :            :  * (2) the lru_lock must not be held.
    1353                 :            :  * (3) interrupts must be enabled.
    1354                 :            :  */
    1355                 :          0 : int isolate_lru_page(struct page *page)
    1356                 :            : {
    1357                 :            :         int ret = -EBUSY;
    1358                 :            : 
    1359                 :            :         VM_BUG_ON_PAGE(!page_count(page), page);
    1360                 :            : 
    1361         [ +  - ]:       3783 :         if (PageLRU(page)) {
    1362                 :       3783 :                 struct zone *zone = page_zone(page);
    1363                 :            :                 struct lruvec *lruvec;
    1364                 :            : 
    1365                 :            :                 spin_lock_irq(&zone->lru_lock);
    1366                 :            :                 lruvec = mem_cgroup_page_lruvec(page, zone);
    1367         [ +  - ]:       3783 :                 if (PageLRU(page)) {
    1368                 :            :                         int lru = page_lru(page);
    1369                 :            :                         get_page(page);
    1370                 :            :                         ClearPageLRU(page);
    1371                 :            :                         del_page_from_lru_list(page, lruvec, lru);
    1372                 :            :                         ret = 0;
    1373                 :            :                 }
    1374                 :            :                 spin_unlock_irq(&zone->lru_lock);
    1375                 :            :         }
    1376                 :       3783 :         return ret;
    1377                 :            : }
    1378                 :            : 
    1379                 :            : /*
    1380                 :            :  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
    1381                 :            :  * then get rescheduled. When there are a massive number of tasks doing page
    1382                 :            :  * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
    1383                 :            :  * the LRU list will go small and be scanned faster than necessary, leading to
    1384                 :            :  * unnecessary swapping, thrashing and OOM.
    1385                 :            :  */
    1386                 :      36028 : static int too_many_isolated(struct zone *zone, int file,
    1387                 :            :                 struct scan_control *sc)
    1388                 :            : {
    1389                 :            :         unsigned long inactive, isolated;
    1390                 :            : 
    1391            [ + ]:      36028 :         if (current_is_kswapd())
    1392                 :            :                 return 0;
    1393                 :            : 
    1394                 :            :         if (!global_reclaim(sc))
    1395                 :            :                 return 0;
    1396                 :            : 
    1397         [ +  - ]:      59294 :         if (file) {
    1398                 :            :                 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    1399                 :            :                 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
    1400                 :            :         } else {
    1401                 :            :                 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    1402                 :            :                 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
    1403                 :            :         }
    1404                 :            : 
    1405                 :            :         /*
    1406                 :            :          * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
    1407                 :            :          * won't get blocked by normal direct-reclaimers, forming a circular
    1408                 :            :          * deadlock.
    1409                 :            :          */
    1410         [ +  - ]:      23266 :         if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
    1411                 :      23266 :                 inactive >>= 3;
    1412                 :            : 
    1413                 :      23266 :         return isolated > inactive;
    1414                 :            : }
    1415                 :            : 
    1416                 :            : static noinline_for_stack void
    1417                 :          0 : putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
    1418                 :            : {
    1419                 :            :         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    1420                 :            :         struct zone *zone = lruvec_zone(lruvec);
    1421                 :       6922 :         LIST_HEAD(pages_to_free);
    1422                 :            : 
    1423                 :            :         /*
    1424                 :            :          * Put back any unfreeable pages.
    1425                 :            :          */
    1426         [ +  + ]:      29712 :         while (!list_empty(page_list)) {
    1427                 :      22790 :                 struct page *page = lru_to_page(page_list);
    1428                 :            :                 int lru;
    1429                 :            : 
    1430                 :            :                 VM_BUG_ON_PAGE(PageLRU(page), page);
    1431                 :            :                 list_del(&page->lru);
    1432         [ -  + ]:      22790 :                 if (unlikely(!page_evictable(page))) {
    1433                 :            :                         spin_unlock_irq(&zone->lru_lock);
    1434                 :          0 :                         putback_lru_page(page);
    1435                 :            :                         spin_lock_irq(&zone->lru_lock);
    1436                 :          0 :                         continue;
    1437                 :            :                 }
    1438                 :            : 
    1439                 :            :                 lruvec = mem_cgroup_page_lruvec(page, zone);
    1440                 :            : 
    1441                 :            :                 SetPageLRU(page);
    1442                 :            :                 lru = page_lru(page);
    1443                 :            :                 add_page_to_lru_list(page, lruvec, lru);
    1444                 :            : 
    1445         [ +  + ]:      22790 :                 if (is_active_lru(lru)) {
    1446                 :            :                         int file = is_file_lru(lru);
    1447                 :            :                         int numpages = hpage_nr_pages(page);
    1448                 :      17664 :                         reclaim_stat->recent_rotated[file] += numpages;
    1449                 :            :                 }
    1450         [ -  + ]:      22790 :                 if (put_page_testzero(page)) {
    1451                 :            :                         __ClearPageLRU(page);
    1452                 :            :                         __ClearPageActive(page);
    1453                 :            :                         del_page_from_lru_list(page, lruvec, lru);
    1454                 :            : 
    1455         [ #  # ]:          0 :                         if (unlikely(PageCompound(page))) {
    1456                 :            :                                 spin_unlock_irq(&zone->lru_lock);
    1457                 :          0 :                                 (*get_compound_page_dtor(page))(page);
    1458                 :            :                                 spin_lock_irq(&zone->lru_lock);
    1459                 :            :                         } else
    1460                 :            :                                 list_add(&page->lru, &pages_to_free);
    1461                 :            :                 }
    1462                 :            :         }
    1463                 :            : 
    1464                 :            :         /*
    1465                 :            :          * To save our caller's stack, now use input list for pages to free.
    1466                 :            :          */
    1467                 :            :         list_splice(&pages_to_free, page_list);
    1468                 :       6922 : }
    1469                 :            : 
    1470                 :            : /*
    1471                 :            :  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
    1472                 :            :  * of reclaimed pages
    1473                 :            :  */
    1474                 :            : static noinline_for_stack unsigned long
    1475                 :          0 : shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
    1476                 :            :                      struct scan_control *sc, enum lru_list lru)
    1477                 :            : {
    1478                 :      35839 :         LIST_HEAD(page_list);
    1479                 :            :         unsigned long nr_scanned;
    1480                 :            :         unsigned long nr_reclaimed = 0;
    1481                 :            :         unsigned long nr_taken;
    1482                 :      35839 :         unsigned long nr_dirty = 0;
    1483                 :      35839 :         unsigned long nr_congested = 0;
    1484                 :      35839 :         unsigned long nr_unqueued_dirty = 0;
    1485                 :      35839 :         unsigned long nr_writeback = 0;
    1486                 :      35839 :         unsigned long nr_immediate = 0;
    1487                 :            :         isolate_mode_t isolate_mode = 0;
    1488                 :            :         int file = is_file_lru(lru);
    1489                 :      35839 :         struct zone *zone = lruvec_zone(lruvec);
    1490                 :            :         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    1491                 :            : 
    1492         [ +  + ]:      36029 :         while (unlikely(too_many_isolated(zone, file, sc))) {
    1493                 :        190 :                 congestion_wait(BLK_RW_ASYNC, HZ/10);
    1494                 :            : 
    1495                 :            :                 /* We are about to die and free our memory. Return now. */
    1496         [ +  - ]:        190 :                 if (fatal_signal_pending(current))
    1497                 :            :                         return SWAP_CLUSTER_MAX;
    1498                 :            :         }
    1499                 :            : 
    1500                 :      35840 :         lru_add_drain();
    1501                 :            : 
    1502         [ -  + ]:      35840 :         if (!sc->may_unmap)
    1503                 :            :                 isolate_mode |= ISOLATE_UNMAPPED;
    1504         [ -  + ]:      35840 :         if (!sc->may_writepage)
    1505                 :          0 :                 isolate_mode |= ISOLATE_CLEAN;
    1506                 :            : 
    1507                 :            :         spin_lock_irq(&zone->lru_lock);
    1508                 :            : 
    1509                 :      35840 :         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
    1510                 :            :                                      &nr_scanned, sc, isolate_mode, lru);
    1511                 :            : 
    1512                 :      35840 :         __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    1513                 :      35840 :         __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    1514                 :            : 
    1515                 :            :         if (global_reclaim(sc)) {
    1516                 :      35840 :                 zone->pages_scanned += nr_scanned;
    1517         [ +  + ]:      35840 :                 if (current_is_kswapd())
    1518                 :      12762 :                         __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
    1519                 :            :                 else
    1520                 :      23078 :                         __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
    1521                 :            :         }
    1522                 :            :         spin_unlock_irq(&zone->lru_lock);
    1523                 :            : 
    1524         [ +  + ]:      35840 :         if (nr_taken == 0)
    1525                 :            :                 return 0;
    1526                 :            : 
    1527                 :       6922 :         nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
    1528                 :            :                                 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
    1529                 :            :                                 &nr_writeback, &nr_immediate,
    1530                 :            :                                 false);
    1531                 :            : 
    1532                 :            :         spin_lock_irq(&zone->lru_lock);
    1533                 :            : 
    1534                 :       6922 :         reclaim_stat->recent_scanned[file] += nr_taken;
    1535                 :            : 
    1536                 :            :         if (global_reclaim(sc)) {
    1537         [ +  + ]:       6922 :                 if (current_is_kswapd())
    1538                 :       5376 :                         __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
    1539                 :            :                                                nr_reclaimed);
    1540                 :            :                 else
    1541                 :       1546 :                         __count_zone_vm_events(PGSTEAL_DIRECT, zone,
    1542                 :            :                                                nr_reclaimed);
    1543                 :            :         }
    1544                 :            : 
    1545                 :       6922 :         putback_inactive_pages(lruvec, &page_list);
    1546                 :            : 
    1547                 :       6922 :         __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    1548                 :            : 
    1549                 :            :         spin_unlock_irq(&zone->lru_lock);
    1550                 :            : 
    1551                 :       6922 :         free_hot_cold_page_list(&page_list, 1);
    1552                 :            : 
    1553                 :            :         /*
    1554                 :            :          * If reclaim is isolating dirty pages under writeback, it implies
    1555                 :            :          * that the long-lived page allocation rate is exceeding the page
    1556                 :            :          * laundering rate. Either the global limits are not being effective
    1557                 :            :          * at throttling processes due to the page distribution throughout
    1558                 :            :          * zones or there is heavy usage of a slow backing device. The
    1559                 :            :          * only option is to throttle from reclaim context which is not ideal
    1560                 :            :          * as there is no guarantee the dirtying process is throttled in the
    1561                 :            :          * same way balance_dirty_pages() manages.
    1562                 :            :          *
    1563                 :            :          * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
    1564                 :            :          * of pages under writeback flagged for immediate reclaim and stall if any
    1565                 :            :          * are encountered in the nr_immediate check below.
    1566                 :            :          */
    1567 [ +  + ][ +  + ]:       6922 :         if (nr_writeback && nr_writeback == nr_taken)
    1568                 :            :                 zone_set_flag(zone, ZONE_WRITEBACK);
    1569                 :            : 
    1570                 :            :         /*
    1571                 :            :          * memcg will stall in page writeback so only consider forcibly
    1572                 :            :          * stalling for global reclaim
    1573                 :            :          */
    1574                 :            :         if (global_reclaim(sc)) {
    1575                 :            :                 /*
    1576                 :            :                  * Tag a zone as congested if all the dirty pages scanned were
    1577                 :            :                  * backed by a congested BDI and wait_iff_congested will stall.
    1578                 :            :                  */
    1579 [ +  + ][ +  + ]:       6922 :                 if (nr_dirty && nr_dirty == nr_congested)
    1580                 :            :                         zone_set_flag(zone, ZONE_CONGESTED);
    1581                 :            : 
    1582                 :            :                 /*
    1583                 :            :                  * If dirty pages are scanned that are not queued for IO, it
    1584                 :            :                  * implies that flushers are not keeping up. In this case, flag
    1585                 :            :                  * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
    1586                 :            :                  * pages from reclaim context. It will forcibly stall in the
    1587                 :            :                  * next check.
    1588                 :            :                  */
    1589         [ +  + ]:       6922 :                 if (nr_unqueued_dirty == nr_taken)
    1590                 :            :                         zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
    1591                 :            : 
    1592                 :            :                 /*
    1593                 :            :                  * In addition, if kswapd scans pages marked for
    1594                 :            :                  * immediate reclaim and under writeback (nr_immediate), it
    1595                 :            :                  * implies that pages are cycling through the LRU faster than
    1596                 :            :                  * they are written so also forcibly stall.
    1597                 :            :                  */
    1598    [ +  + ][ + ]:       6922 :                 if (nr_unqueued_dirty == nr_taken || nr_immediate)
    1599                 :          0 :                         congestion_wait(BLK_RW_ASYNC, HZ/10);
    1600                 :            :         }
    1601                 :            : 
    1602                 :            :         /*
    1603                 :            :          * Stall direct reclaim for IO completions if underlying BDIs or zone
    1604                 :            :          * is congested. Allow kswapd to continue until it starts encountering
    1605                 :            :          * unqueued dirty pages or cycling through the LRU too quickly.
    1606                 :            :          */
    1607 [ +  - ][ +  + ]:      42761 :         if (!sc->hibernation_mode && !current_is_kswapd())
    1608                 :       1546 :                 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
    1609                 :            : 
    1610         [ -  + ]:       6922 :         trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
    1611                 :       6922 :                 zone_idx(zone),
    1612                 :            :                 nr_scanned, nr_reclaimed,
    1613                 :            :                 sc->priority,
    1614                 :            :                 trace_shrink_flags(file));
    1615                 :       6922 :         return nr_reclaimed;
    1616                 :            : }
    1617                 :            : 
    1618                 :            : /*
    1619                 :            :  * This moves pages from the active list to the inactive list.
    1620                 :            :  *
    1621                 :            :  * We move them the other way if the page is referenced by one or more
    1622                 :            :  * processes, from rmap.
    1623                 :            :  *
    1624                 :            :  * If the pages are mostly unmapped, the processing is fast and it is
    1625                 :            :  * appropriate to hold zone->lru_lock across the whole operation.  But if
    1626                 :            :  * the pages are mapped, the processing is slow (page_referenced()) so we
    1627                 :            :  * should drop zone->lru_lock around each page.  It's impossible to balance
    1628                 :            :  * this, so instead we remove the pages from the LRU while processing them.
    1629                 :            :  * It is safe to rely on PG_active against the non-LRU pages in here because
    1630                 :            :  * nobody will play with that bit on a non-LRU page.
    1631                 :            :  *
    1632                 :            :  * The downside is that we have to touch page->_count against each page.
    1633                 :            :  * But we had to alter page->flags anyway.
    1634                 :            :  */
    1635                 :            : 
    1636                 :          0 : static void move_active_pages_to_lru(struct lruvec *lruvec,
    1637                 :            :                                      struct list_head *list,
    1638                 :            :                                      struct list_head *pages_to_free,
    1639                 :            :                                      enum lru_list lru)
    1640                 :            : {
    1641                 :      33546 :         struct zone *zone = lruvec_zone(lruvec);
    1642                 :            :         unsigned long pgmoved = 0;
    1643                 :          0 :         struct page *page;
    1644                 :            :         int nr_pages;
    1645                 :            : 
    1646         [ +  + ]:      80162 :         while (!list_empty(list)) {
    1647                 :      46616 :                 page = lru_to_page(list);
    1648                 :            :                 lruvec = mem_cgroup_page_lruvec(page, zone);
    1649                 :            : 
    1650                 :            :                 VM_BUG_ON_PAGE(PageLRU(page), page);
    1651                 :            :                 SetPageLRU(page);
    1652                 :            : 
    1653                 :            :                 nr_pages = hpage_nr_pages(page);
    1654                 :            :                 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
    1655                 :      46616 :                 list_move(&page->lru, &lruvec->lists[lru]);
    1656                 :      46616 :                 pgmoved += nr_pages;
    1657                 :            : 
    1658         [ -  + ]:      46616 :                 if (put_page_testzero(page)) {
    1659                 :            :                         __ClearPageLRU(page);
    1660                 :            :                         __ClearPageActive(page);
    1661                 :            :                         del_page_from_lru_list(page, lruvec, lru);
    1662                 :            : 
    1663         [ #  # ]:          0 :                         if (unlikely(PageCompound(page))) {
    1664                 :            :                                 spin_unlock_irq(&zone->lru_lock);
    1665                 :          0 :                                 (*get_compound_page_dtor(page))(page);
    1666                 :            :                                 spin_lock_irq(&zone->lru_lock);
    1667                 :            :                         } else
    1668                 :            :                                 list_add(&page->lru, pages_to_free);
    1669                 :            :                 }
    1670                 :            :         }
    1671                 :      33546 :         __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
    1672         [ +  + ]:      33546 :         if (!is_active_lru(lru))
    1673                 :            :                 __count_vm_events(PGDEACTIVATE, pgmoved);
    1674                 :      33546 : }
    1675                 :            : 
    1676                 :          0 : static void shrink_active_list(unsigned long nr_to_scan,
    1677                 :            :                                struct lruvec *lruvec,
    1678                 :            :                                struct scan_control *sc,
    1679                 :            :                                enum lru_list lru)
    1680                 :            : {
    1681                 :            :         unsigned long nr_taken;
    1682                 :            :         unsigned long nr_scanned;
    1683                 :            :         unsigned long vm_flags;
    1684                 :      16773 :         LIST_HEAD(l_hold);      /* The pages which were snipped off */
    1685                 :      16773 :         LIST_HEAD(l_active);
    1686                 :      16773 :         LIST_HEAD(l_inactive);
    1687                 :          0 :         struct page *page;
    1688                 :            :         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    1689                 :            :         unsigned long nr_rotated = 0;
    1690                 :            :         isolate_mode_t isolate_mode = 0;
    1691                 :            :         int file = is_file_lru(lru);
    1692                 :      16773 :         struct zone *zone = lruvec_zone(lruvec);
    1693                 :            : 
    1694                 :      16773 :         lru_add_drain();
    1695                 :            : 
    1696         [ -  + ]:      16772 :         if (!sc->may_unmap)
    1697                 :            :                 isolate_mode |= ISOLATE_UNMAPPED;
    1698         [ -  + ]:      16772 :         if (!sc->may_writepage)
    1699                 :          0 :                 isolate_mode |= ISOLATE_CLEAN;
    1700                 :            : 
    1701                 :            :         spin_lock_irq(&zone->lru_lock);
    1702                 :            : 
    1703                 :      16773 :         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
    1704                 :            :                                      &nr_scanned, sc, isolate_mode, lru);
    1705                 :            :         if (global_reclaim(sc))
    1706                 :      16773 :                 zone->pages_scanned += nr_scanned;
    1707                 :            : 
    1708                 :      16773 :         reclaim_stat->recent_scanned[file] += nr_taken;
    1709                 :            : 
    1710                 :      16773 :         __count_zone_vm_events(PGREFILL, zone, nr_scanned);
    1711                 :      16773 :         __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    1712                 :      16773 :         __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    1713                 :            :         spin_unlock_irq(&zone->lru_lock);
    1714                 :            : 
    1715         [ +  + ]:      63386 :         while (!list_empty(&l_hold)) {
    1716                 :      46613 :                 cond_resched();
    1717                 :      46615 :                 page = lru_to_page(&l_hold);
    1718                 :            :                 list_del(&page->lru);
    1719                 :            : 
    1720         [ -  + ]:      46615 :                 if (unlikely(!page_evictable(page))) {
    1721                 :          0 :                         putback_lru_page(page);
    1722                 :          0 :                         continue;
    1723                 :            :                 }
    1724                 :            : 
    1725         [ -  + ]:      46615 :                 if (unlikely(buffer_heads_over_limit)) {
    1726   [ #  #  #  # ]:          0 :                         if (page_has_private(page) && trylock_page(page)) {
    1727         [ #  # ]:          0 :                                 if (page_has_private(page))
    1728                 :          0 :                                         try_to_release_page(page, 0);
    1729                 :          0 :                                 unlock_page(page);
    1730                 :            :                         }
    1731                 :            :                 }
    1732                 :            : 
    1733         [ +  + ]:      46615 :                 if (page_referenced(page, 0, sc->target_mem_cgroup,
    1734                 :            :                                     &vm_flags)) {
    1735                 :        896 :                         nr_rotated += hpage_nr_pages(page);
    1736                 :            :                         /*
    1737                 :            :                          * Identify referenced, file-backed active pages and
    1738                 :            :                          * give them one more trip around the active list, so
    1739                 :            :                          * that executable code gets a better chance to stay in
    1740                 :            :                          * memory under moderate memory pressure.  Anon pages
    1741                 :            :                          * are not likely to be evicted by use-once streaming
    1742                 :            :                          * IO, plus JVM can create lots of anon VM_EXEC pages,
    1743                 :            :                          * so we ignore them here.
    1744                 :            :                          */
    1745 [ +  + ][ +  - ]:        896 :                         if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
    1746                 :        871 :                                 list_add(&page->lru, &l_active);
    1747                 :        871 :                                 continue;
    1748                 :            :                         }
    1749                 :            :                 }
    1750                 :            : 
    1751                 :            :                 ClearPageActive(page);  /* we are de-activating */
    1752                 :      45743 :                 list_add(&page->lru, &l_inactive);
    1753                 :            :         }
    1754                 :            : 
    1755                 :            :         /*
    1756                 :            :          * Move pages back to the lru list.
    1757                 :            :          */
    1758                 :            :         spin_lock_irq(&zone->lru_lock);
    1759                 :            :         /*
    1760                 :            :          * Count referenced pages from currently used mappings as rotated,
    1761                 :            :          * even though only some of them are actually re-activated.  This
    1762                 :            :          * helps balance scan pressure between file and anonymous pages in
    1763                 :            :          * get_scan_count().
    1764                 :            :          */
    1765                 :      16773 :         reclaim_stat->recent_rotated[file] += nr_rotated;
    1766                 :            : 
    1767                 :      16773 :         move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
    1768                 :      16773 :         move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
    1769                 :      16773 :         __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    1770                 :            :         spin_unlock_irq(&zone->lru_lock);
    1771                 :            : 
    1772                 :      16773 :         free_hot_cold_page_list(&l_hold, 1);
    1773                 :      16773 : }
    1774                 :            : 
    1775                 :            : #ifdef CONFIG_SWAP
    1776                 :            : static int inactive_anon_is_low_global(struct zone *zone)
    1777                 :            : {
    1778                 :            :         unsigned long active, inactive;
    1779                 :            : 
    1780                 :            :         active = zone_page_state(zone, NR_ACTIVE_ANON);
    1781                 :            :         inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    1782                 :            : 
    1783 [ #  # ][ #  # ]:          0 :         if (inactive * zone->inactive_ratio < active)
                 [ #  # ]
    1784                 :            :                 return 1;
    1785                 :            : 
    1786                 :            :         return 0;
    1787                 :            : }
    1788                 :            : 
    1789                 :            : /**
    1790                 :            :  * inactive_anon_is_low - check if anonymous pages need to be deactivated
    1791                 :            :  * @lruvec: LRU vector to check
    1792                 :            :  *
    1793                 :            :  * Returns true if the zone does not have enough inactive anon pages,
    1794                 :            :  * meaning some active anon pages need to be deactivated.
    1795                 :            :  */
    1796                 :            : static int inactive_anon_is_low(struct lruvec *lruvec)
    1797                 :            : {
    1798                 :            :         /*
    1799                 :            :          * If we don't have swap space, anonymous page deactivation
    1800                 :            :          * is pointless.
    1801                 :            :          */
    1802   [ #  #  -  + ]:     161285 :         if (!total_swap_pages)
                 [ #  # ]
    1803                 :            :                 return 0;
    1804                 :            : 
    1805                 :            :         if (!mem_cgroup_disabled())
    1806                 :            :                 return mem_cgroup_inactive_anon_is_low(lruvec);
    1807                 :            : 
    1808                 :            :         return inactive_anon_is_low_global(lruvec_zone(lruvec));
    1809                 :            : }
    1810                 :            : #else
    1811                 :            : static inline int inactive_anon_is_low(struct lruvec *lruvec)
    1812                 :            : {
    1813                 :            :         return 0;
    1814                 :            : }
    1815                 :            : #endif
    1816                 :            : 
    1817                 :            : /**
    1818                 :            :  * inactive_file_is_low - check if file pages need to be deactivated
    1819                 :            :  * @lruvec: LRU vector to check
    1820                 :            :  *
    1821                 :            :  * When the system is doing streaming IO, memory pressure here
    1822                 :            :  * ensures that active file pages get deactivated, until more
    1823                 :            :  * than half of the file pages are on the inactive list.
    1824                 :            :  *
    1825                 :            :  * Once we get to that situation, protect the system's working
    1826                 :            :  * set from being evicted by disabling active file page aging.
    1827                 :            :  *
    1828                 :            :  * This uses a different ratio than the anonymous pages, because
    1829                 :            :  * the page cache uses a use-once replacement algorithm.
    1830                 :            :  */
    1831                 :            : static int inactive_file_is_low(struct lruvec *lruvec)
    1832                 :            : {
    1833                 :            :         unsigned long inactive;
    1834                 :            :         unsigned long active;
    1835                 :            : 
    1836                 :            :         inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
    1837                 :            :         active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
    1838                 :            : 
    1839                 :      31814 :         return active > inactive;
    1840                 :            : }
    1841                 :            : 
    1842                 :          0 : static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
    1843                 :            : {
    1844         [ +  - ]:      31814 :         if (is_file_lru(lru))
    1845                 :      31814 :                 return inactive_file_is_low(lruvec);
    1846                 :            :         else
    1847                 :          0 :                 return inactive_anon_is_low(lruvec);
    1848                 :            : }
    1849                 :            : 
    1850                 :          0 : static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
    1851                 :            :                                  struct lruvec *lruvec, struct scan_control *sc)
    1852                 :            : {
    1853         [ +  + ]:      67653 :         if (is_active_lru(lru)) {
    1854         [ +  + ]:      31814 :                 if (inactive_list_is_low(lruvec, lru))
    1855                 :      16773 :                         shrink_active_list(nr_to_scan, lruvec, sc, lru);
    1856                 :            :                 return 0;
    1857                 :            :         }
    1858                 :            : 
    1859                 :      35839 :         return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
    1860                 :            : }
    1861                 :            : 
    1862                 :            : static int vmscan_swappiness(struct scan_control *sc)
    1863                 :            : {
    1864                 :            :         if (global_reclaim(sc))
    1865                 :          0 :                 return vm_swappiness;
    1866                 :            :         return mem_cgroup_swappiness(sc->target_mem_cgroup);
    1867                 :            : }
    1868                 :            : 
    1869                 :            : enum scan_balance {
    1870                 :            :         SCAN_EQUAL,
    1871                 :            :         SCAN_FRACT,
    1872                 :            :         SCAN_ANON,
    1873                 :            :         SCAN_FILE,
    1874                 :            : };
    1875                 :            : 
    1876                 :            : /*
    1877                 :            :  * Determine how aggressively the anon and file LRU lists should be
    1878                 :            :  * scanned.  The relative value of each set of LRU lists is determined
    1879                 :            :  * by looking at the fraction of the scanned pages that were rotated
    1880                 :            :  * back onto the active list instead of being evicted.
    1881                 :            :  *
    1882                 :            :  * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
    1883                 :            :  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
    1884                 :            :  */
    1885                 :          0 : static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
    1886                 :            :                            unsigned long *nr)
    1887                 :            : {
    1888                 :            :         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    1889                 :            :         u64 fraction[2];
    1890                 :            :         u64 denominator = 0;    /* gcc */
    1891                 :     161206 :         struct zone *zone = lruvec_zone(lruvec);
    1892                 :            :         unsigned long anon_prio, file_prio;
    1893                 :            :         enum scan_balance scan_balance;
    1894                 :            :         unsigned long anon, file, free;
    1895                 :            :         bool force_scan = false;
    1896                 :            :         unsigned long ap, fp;
    1897                 :            :         enum lru_list lru;
    1898                 :            : 
    1899                 :            :         /*
    1900                 :            :          * If the zone or memcg is small, nr[lru] can be 0.  This
    1901                 :            :          * results in no scanning on this priority and a potential
    1902                 :            :          * priority drop.  Global direct reclaim can go to the next
    1903                 :            :          * zone and tends to have no problems. Global kswapd is for
    1904                 :            :          * zone balancing and it needs to scan a minimum amount. When
    1905                 :            :          * reclaiming for a memcg, a priority drop can cause high
    1906                 :            :          * latencies, so it's better to scan a minimum amount there as
    1907                 :            :          * well.
    1908                 :            :          */
    1909   [ +  +  +  + ]:     210890 :         if (current_is_kswapd() && !zone_reclaimable(zone))
    1910                 :            :                 force_scan = true;
    1911                 :            :         if (!global_reclaim(sc))
    1912                 :            :                 force_scan = true;
    1913                 :            : 
    1914                 :            :         /* If we have no swap space, do not bother scanning anon pages. */
    1915    [ + ][ -  + ]:     161206 :         if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
    1916                 :            :                 scan_balance = SCAN_FILE;
    1917                 :            :                 goto out;
    1918                 :            :         }
    1919                 :            : 
    1920                 :            :         /*
    1921                 :            :          * Global reclaim will swap to prevent OOM even with no
    1922                 :            :          * swappiness, but memcg users want to use this knob to
    1923                 :            :          * disable swapping for individual groups completely when
    1924                 :            :          * using the memory controller's swap limit feature would be
    1925                 :            :          * too expensive.
    1926                 :            :          */
    1927                 :            :         if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
    1928                 :            :                 scan_balance = SCAN_FILE;
    1929                 :            :                 goto out;
    1930                 :            :         }
    1931                 :            : 
    1932                 :            :         /*
    1933                 :            :          * Do not apply any pressure balancing cleverness when the
    1934                 :            :          * system is close to OOM, scan both anon and file equally
    1935                 :            :          * (unless the swappiness setting disagrees with swapping).
    1936                 :            :          */
    1937 [ #  # ][ #  # ]:          0 :         if (!sc->priority && vmscan_swappiness(sc)) {
    1938                 :            :                 scan_balance = SCAN_EQUAL;
    1939                 :            :                 goto out;
    1940                 :            :         }
    1941                 :            : 
    1942                 :          0 :         anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
    1943                 :            :                 get_lru_size(lruvec, LRU_INACTIVE_ANON);
    1944                 :          0 :         file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
    1945                 :            :                 get_lru_size(lruvec, LRU_INACTIVE_FILE);
    1946                 :            : 
    1947                 :            :         /*
    1948                 :            :          * If it's foreseeable that reclaiming the file cache won't be
    1949                 :            :          * enough to get the zone back into a desirable shape, we have
    1950                 :            :          * to swap.  Better start now and leave the - probably heavily
    1951                 :            :          * thrashing - remaining file pages alone.
    1952                 :            :          */
    1953                 :            :         if (global_reclaim(sc)) {
    1954                 :            :                 free = zone_page_state(zone, NR_FREE_PAGES);
    1955         [ #  # ]:          0 :                 if (unlikely(file + free <= high_wmark_pages(zone))) {
    1956                 :            :                         scan_balance = SCAN_ANON;
    1957                 :            :                         goto out;
    1958                 :            :                 }
    1959                 :            :         }
    1960                 :            : 
    1961                 :            :         /*
    1962                 :            :          * There is enough inactive page cache, do not reclaim
    1963                 :            :          * anything from the anonymous working set right now.
    1964                 :            :          */
    1965         [ #  # ]:          0 :         if (!inactive_file_is_low(lruvec)) {
    1966                 :            :                 scan_balance = SCAN_FILE;
    1967                 :            :                 goto out;
    1968                 :            :         }
    1969                 :            : 
    1970                 :            :         scan_balance = SCAN_FRACT;
    1971                 :            : 
    1972                 :            :         /*
    1973                 :            :          * With swappiness at 100, anonymous and file have the same priority.
    1974                 :            :          * This scanning priority is essentially the inverse of IO cost.
    1975                 :            :          */
    1976                 :          0 :         anon_prio = vmscan_swappiness(sc);
    1977                 :          0 :         file_prio = 200 - anon_prio;
    1978                 :            : 
    1979                 :            :         /*
    1980                 :            :          * OK, so we have swap space and a fair amount of page cache
    1981                 :            :          * pages.  We use the recently rotated / recently scanned
    1982                 :            :          * ratios to determine how valuable each cache is.
    1983                 :            :          *
    1984                 :            :          * Because workloads change over time (and to avoid overflow)
    1985                 :            :          * we keep these statistics as a floating average, which ends
    1986                 :            :          * up weighing recent references more than old ones.
    1987                 :            :          *
    1988                 :            :          * anon in [0], file in [1]
    1989                 :            :          */
    1990                 :            :         spin_lock_irq(&zone->lru_lock);
    1991         [ #  # ]:          0 :         if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
    1992                 :          0 :                 reclaim_stat->recent_scanned[0] /= 2;
    1993                 :          0 :                 reclaim_stat->recent_rotated[0] /= 2;
    1994                 :            :         }
    1995                 :            : 
    1996         [ #  # ]:          0 :         if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
    1997                 :          0 :                 reclaim_stat->recent_scanned[1] /= 2;
    1998                 :          0 :                 reclaim_stat->recent_rotated[1] /= 2;
    1999                 :            :         }
    2000                 :            : 
    2001                 :            :         /*
    2002                 :            :          * The amount of pressure on anon vs file pages is inversely
    2003                 :            :          * proportional to the fraction of recently scanned pages on
    2004                 :            :          * each list that were recently referenced and in active use.
    2005                 :            :          */
    2006                 :          0 :         ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
    2007                 :          0 :         ap /= reclaim_stat->recent_rotated[0] + 1;
    2008                 :            : 
    2009                 :          0 :         fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
    2010                 :          0 :         fp /= reclaim_stat->recent_rotated[1] + 1;
    2011                 :            :         spin_unlock_irq(&zone->lru_lock);
    2012                 :            : 
    2013                 :          0 :         fraction[0] = ap;
    2014                 :          0 :         fraction[1] = fp;
    2015                 :     161206 :         denominator = ap + fp + 1;
    2016                 :            : out:
    2017         [ +  + ]:     806168 :         for_each_evictable_lru(lru) {
    2018                 :            :                 int file = is_file_lru(lru);
    2019                 :            :                 unsigned long size;
    2020                 :            :                 unsigned long scan;
    2021                 :            : 
    2022                 :            :                 size = get_lru_size(lruvec, lru);
    2023                 :     644896 :                 scan = size >> sc->priority;
    2024                 :            : 
    2025         [ +  + ]:     644896 :                 if (!scan && force_scan)
    2026                 :       8130 :                         scan = min(size, SWAP_CLUSTER_MAX);
    2027                 :            : 
    2028      [ -  +  + ]:     644896 :                 switch (scan_balance) {
    2029                 :            :                 case SCAN_EQUAL:
    2030                 :            :                         /* Scan lists relative to size */
    2031                 :            :                         break;
    2032                 :            :                 case SCAN_FRACT:
    2033                 :            :                         /*
    2034                 :            :                          * Scan types proportional to swappiness and
    2035                 :            :                          * their relative recent reclaim efficiency.
    2036                 :            :                          */
    2037                 :          0 :                         scan = div64_u64(scan * fraction[file], denominator);
    2038                 :            :                         break;
    2039                 :            :                 case SCAN_FILE:
    2040                 :            :                 case SCAN_ANON:
    2041                 :            :                         /* Scan one type exclusively */
    2042         [ +  + ]:     644919 :                         if ((scan_balance == SCAN_FILE) != file)
    2043                 :            :                                 scan = 0;
    2044                 :            :                         break;
    2045                 :            :                 default:
    2046                 :            :                         /* Look ma, no brain */
    2047                 :          0 :                         BUG();
    2048                 :            :                 }
    2049                 :     644962 :                 nr[lru] = scan;
    2050                 :            :         }
    2051                 :     161272 : }
    2052                 :            : 
    2053                 :            : /*
    2054                 :            :  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
    2055                 :            :  */
    2056                 :          0 : static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
    2057                 :            : {
    2058                 :            :         unsigned long nr[NR_LRU_LISTS];
    2059                 :            :         unsigned long targets[NR_LRU_LISTS];
    2060                 :            :         unsigned long nr_to_scan;
    2061                 :            :         enum lru_list lru;
    2062                 :            :         unsigned long nr_reclaimed = 0;
    2063                 :     161260 :         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    2064                 :            :         struct blk_plug plug;
    2065                 :            :         bool scan_adjusted = false;
    2066                 :            : 
    2067                 :     161260 :         get_scan_count(lruvec, sc, nr);
    2068                 :            : 
    2069                 :            :         /* Record the original scan target for proportional adjustments later */
    2070                 :     161268 :         memcpy(targets, nr, sizeof(nr));
    2071                 :            : 
    2072                 :     161268 :         blk_start_plug(&plug);
    2073 [ +  + ][ +  + ]:     209264 :         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                 [ +  + ]
    2074                 :     177449 :                                         nr[LRU_INACTIVE_FILE]) {
    2075                 :            :                 unsigned long nr_anon, nr_file, percentage;
    2076                 :            :                 unsigned long nr_scanned;
    2077                 :            : 
    2078         [ +  + ]:     239893 :                 for_each_evictable_lru(lru) {
    2079         [ +  + ]:     191907 :                         if (nr[lru]) {
    2080                 :          0 :                                 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
    2081                 :          0 :                                 nr[lru] -= nr_to_scan;
    2082                 :            : 
    2083                 :      67651 :                                 nr_reclaimed += shrink_list(lru, nr_to_scan,
    2084                 :            :                                                             lruvec, sc);
    2085                 :            :                         }
    2086                 :            :                 }
    2087                 :            : 
    2088         [ +  + ]:      47986 :                 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
    2089                 :      47983 :                         continue;
    2090                 :            : 
    2091                 :            :                 /*
    2092                 :            :                  * For global direct reclaim, reclaim only the number of pages
    2093                 :            :                  * requested. Less care is taken to scan proportionally as it
    2094                 :            :                  * is more important to minimise direct reclaim stall latency
    2095                 :            :                  * than it is to properly age the LRU lists.
    2096                 :            :                  */
    2097         [ -  + ]:          3 :                 if (global_reclaim(sc) && !current_is_kswapd())
    2098                 :            :                         break;
    2099                 :            : 
    2100                 :            :                 /*
    2101                 :            :                  * For kswapd and memcg, reclaim at least the number of pages
    2102                 :            :                  * requested. Ensure that the anon and file LRUs shrink
    2103                 :            :                  * proportionally what was requested by get_scan_count(). We
    2104                 :            :                  * stop reclaiming one LRU and reduce the amount scanning
    2105                 :            :                  * proportional to the original scan target.
    2106                 :            :                  */
    2107                 :          0 :                 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
    2108                 :          0 :                 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
    2109                 :            : 
    2110         [ #  # ]:          0 :                 if (nr_file > nr_anon) {
    2111                 :          0 :                         unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
    2112                 :          0 :                                                 targets[LRU_ACTIVE_ANON] + 1;
    2113                 :            :                         lru = LRU_BASE;
    2114                 :          0 :                         percentage = nr_anon * 100 / scan_target;
    2115                 :            :                 } else {
    2116                 :          0 :                         unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
    2117                 :          0 :                                                 targets[LRU_ACTIVE_FILE] + 1;
    2118                 :            :                         lru = LRU_FILE;
    2119                 :          0 :                         percentage = nr_file * 100 / scan_target;
    2120                 :            :                 }
    2121                 :            : 
    2122                 :            :                 /* Stop scanning the smaller of the LRU */
    2123                 :          0 :                 nr[lru] = 0;
    2124                 :          0 :                 nr[lru + LRU_ACTIVE] = 0;
    2125                 :            : 
    2126                 :            :                 /*
    2127                 :            :                  * Recalculate the other LRU scan count based on its original
    2128                 :            :                  * scan target and the percentage scanning already complete
    2129                 :            :                  */
    2130         [ #  # ]:          0 :                 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
    2131                 :          0 :                 nr_scanned = targets[lru] - nr[lru];
    2132                 :          0 :                 nr[lru] = targets[lru] * (100 - percentage) / 100;
    2133                 :          0 :                 nr[lru] -= min(nr[lru], nr_scanned);
    2134                 :            : 
    2135                 :          0 :                 lru += LRU_ACTIVE;
    2136                 :          0 :                 nr_scanned = targets[lru] - nr[lru];
    2137                 :          0 :                 nr[lru] = targets[lru] * (100 - percentage) / 100;
    2138                 :      47983 :                 nr[lru] -= min(nr[lru], nr_scanned);
    2139                 :            : 
    2140                 :            :                 scan_adjusted = true;
    2141                 :            :         }
    2142                 :     161284 :         blk_finish_plug(&plug);
    2143                 :     161285 :         sc->nr_reclaimed += nr_reclaimed;
    2144                 :            : 
    2145                 :            :         /*
    2146                 :            :          * Even if we did not try to evict anon pages at all, we want to
    2147                 :            :          * rebalance the anon lru active/inactive ratio.
    2148                 :            :          */
    2149         [ -  + ]:     161285 :         if (inactive_anon_is_low(lruvec))
    2150                 :          0 :                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
    2151                 :            :                                    sc, LRU_ACTIVE_ANON);
    2152                 :            : 
    2153                 :     161285 :         throttle_vm_writeout(sc->gfp_mask);
    2154                 :     161269 : }
    2155                 :            : 
    2156                 :            : /* Use reclaim/compaction for costly allocs or under memory pressure */
    2157                 :     161271 : static bool in_reclaim_compaction(struct scan_control *sc)
    2158                 :            : {
    2159 [ +  + ][ +  - ]:     161271 :         if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
    2160         [ -  + ]:     161271 :                         (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
    2161                 :          2 :                          sc->priority < DEF_PRIORITY - 2))
    2162                 :            :                 return true;
    2163                 :            : 
    2164                 :            :         return false;
    2165                 :            : }
    2166                 :            : 
    2167                 :            : /*
    2168                 :            :  * Reclaim/compaction is used for high-order allocation requests. It reclaims
    2169                 :            :  * order-0 pages before compacting the zone. should_continue_reclaim() returns
    2170                 :            :  * true if more pages should be reclaimed such that when the page allocator
    2171                 :            :  * calls try_to_compact_zone() that it will have enough free pages to succeed.
    2172                 :            :  * It will give up earlier than that if there is difficulty reclaiming pages.
    2173                 :            :  */
    2174                 :            : static inline bool should_continue_reclaim(struct zone *zone,
    2175                 :            :                                         unsigned long nr_reclaimed,
    2176                 :            :                                         unsigned long nr_scanned,
    2177                 :     161257 :                                         struct scan_control *sc)
    2178                 :            : {
    2179                 :            :         unsigned long pages_for_compaction;
    2180                 :            :         unsigned long inactive_lru_pages;
    2181                 :            : 
    2182                 :            :         /* If not in reclaim/compaction mode, stop */
    2183         [ -  + ]:     161257 :         if (!in_reclaim_compaction(sc))
    2184                 :            :                 return false;
    2185                 :            : 
    2186                 :            :         /* Consider stopping depending on scan and reclaim activity */
    2187         [ #  # ]:          0 :         if (sc->gfp_mask & __GFP_REPEAT) {
    2188                 :            :                 /*
    2189                 :            :                  * For __GFP_REPEAT allocations, stop reclaiming if the
    2190                 :            :                  * full LRU list has been scanned and we are still failing
    2191                 :            :                  * to reclaim pages. This full LRU scan is potentially
    2192                 :            :                  * expensive but a __GFP_REPEAT caller really wants to succeed
    2193                 :            :                  */
    2194         [ #  # ]:          0 :                 if (!nr_reclaimed && !nr_scanned)
    2195                 :            :                         return false;
    2196                 :            :         } else {
    2197                 :            :                 /*
    2198                 :            :                  * For non-__GFP_REPEAT allocations which can presumably
    2199                 :            :                  * fail without consequence, stop if we failed to reclaim
    2200                 :            :                  * any pages from the last SWAP_CLUSTER_MAX number of
    2201                 :            :                  * pages that were scanned. This will return to the
    2202                 :            :                  * caller faster at the risk reclaim/compaction and
    2203                 :            :                  * the resulting allocation attempt fails
    2204                 :            :                  */
    2205         [ #  # ]:          0 :                 if (!nr_reclaimed)
    2206                 :            :                         return false;
    2207                 :            :         }
    2208                 :            : 
    2209                 :            :         /*
    2210                 :            :          * If we have not reclaimed enough pages for compaction and the
    2211                 :            :          * inactive lists are large enough, continue reclaiming
    2212                 :            :          */
    2213                 :         19 :         pages_for_compaction = (2UL << sc->order);
    2214                 :            :         inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
    2215         [ -  + ]:         19 :         if (get_nr_swap_pages() > 0)
    2216                 :          0 :                 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
    2217    [ - ][ #  # ]:         19 :         if (sc->nr_reclaimed < pages_for_compaction &&
    2218                 :            :                         inactive_lru_pages > pages_for_compaction)
    2219                 :            :                 return true;
    2220                 :            : 
    2221                 :            :         /* If compaction would go ahead or the allocation would succeed, stop */
    2222         [ #  # ]:          0 :         switch (compaction_suitable(zone, sc->order)) {
    2223                 :            :         case COMPACT_PARTIAL:
    2224                 :            :         case COMPACT_CONTINUE:
    2225                 :            :                 return false;
    2226                 :            :         default:
    2227                 :            :                 return true;
    2228                 :            :         }
    2229                 :            : }
    2230                 :            : 
    2231                 :     161260 : static void shrink_zone(struct zone *zone, struct scan_control *sc)
    2232                 :            : {
    2233                 :            :         unsigned long nr_reclaimed, nr_scanned;
    2234                 :            : 
    2235                 :            :         do {
    2236                 :            :                 struct mem_cgroup *root = sc->target_mem_cgroup;
    2237                 :            :                 struct mem_cgroup_reclaim_cookie reclaim = {
    2238                 :            :                         .zone = zone,
    2239                 :            :                         .priority = sc->priority,
    2240                 :            :                 };
    2241                 :            :                 struct mem_cgroup *memcg;
    2242                 :            : 
    2243                 :     161260 :                 nr_reclaimed = sc->nr_reclaimed;
    2244                 :     161260 :                 nr_scanned = sc->nr_scanned;
    2245                 :            : 
    2246                 :            :                 memcg = mem_cgroup_iter(root, NULL, &reclaim);
    2247                 :            :                 do {
    2248                 :            :                         struct lruvec *lruvec;
    2249                 :            : 
    2250                 :     161260 :                         lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    2251                 :            : 
    2252                 :     161260 :                         shrink_lruvec(lruvec, sc);
    2253                 :            : 
    2254                 :            :                         /*
    2255                 :            :                          * Direct reclaim and kswapd have to scan all memory
    2256                 :            :                          * cgroups to fulfill the overall scan target for the
    2257                 :            :                          * zone.
    2258                 :            :                          *
    2259                 :            :                          * Limit reclaim, on the other hand, only cares about
    2260                 :            :                          * nr_to_reclaim pages to be reclaimed and it will
    2261                 :            :                          * retry with decreasing priority if one round over the
    2262                 :            :                          * whole hierarchy is not sufficient.
    2263                 :            :                          */
    2264                 :            :                         if (!global_reclaim(sc) &&
    2265                 :            :                                         sc->nr_reclaimed >= sc->nr_to_reclaim) {
    2266                 :            :                                 mem_cgroup_iter_break(root, memcg);
    2267                 :            :                                 break;
    2268                 :            :                         }
    2269                 :            :                         memcg = mem_cgroup_iter(root, memcg, &reclaim);
    2270                 :            :                 } while (memcg);
    2271                 :            : 
    2272                 :            :                 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
    2273                 :            :                            sc->nr_scanned - nr_scanned,
    2274                 :            :                            sc->nr_reclaimed - nr_reclaimed);
    2275                 :            : 
    2276                 :     322514 :         } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
    2277         [ -  + ]:     161242 :                                          sc->nr_scanned - nr_scanned, sc));
    2278                 :     161242 : }
    2279                 :            : 
    2280                 :            : /* Returns true if compaction should go ahead for a high-order request */
    2281                 :            : static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
    2282                 :            : {
    2283                 :            :         unsigned long balance_gap, watermark;
    2284                 :            :         bool watermark_ok;
    2285                 :            : 
    2286                 :            :         /* Do not consider compaction for orders reclaim is meant to satisfy */
    2287         [ -  + ]:     111622 :         if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
    2288                 :            :                 return false;
    2289                 :            : 
    2290                 :            :         /*
    2291                 :            :          * Compaction takes time to run and there are potentially other
    2292                 :            :          * callers using the pages just freed. Continue reclaiming until
    2293                 :            :          * there is a buffer of free pages available to give compaction
    2294                 :            :          * a reasonable chance of completing and allocating the page
    2295                 :            :          */
    2296                 :          0 :         balance_gap = min(low_wmark_pages(zone),
    2297                 :            :                 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
    2298                 :            :                         KSWAPD_ZONE_BALANCE_GAP_RATIO);
    2299                 :          0 :         watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
    2300                 :          0 :         watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
    2301                 :            : 
    2302                 :            :         /*
    2303                 :            :          * If compaction is deferred, reclaim up to a point where
    2304                 :            :          * compaction will have a chance of success when re-enabled
    2305                 :            :          */
    2306         [ #  # ]:          0 :         if (compaction_deferred(zone, sc->order))
    2307                 :            :                 return watermark_ok;
    2308                 :            : 
    2309                 :            :         /* If compaction is not ready to start, keep reclaiming */
    2310         [ #  # ]:          0 :         if (!compaction_suitable(zone, sc->order))
    2311                 :            :                 return false;
    2312                 :            : 
    2313                 :            :         return watermark_ok;
    2314                 :            : }
    2315                 :            : 
    2316                 :            : /*
    2317                 :            :  * This is the direct reclaim path, for page-allocating processes.  We only
    2318                 :            :  * try to reclaim pages from zones which will satisfy the caller's allocation
    2319                 :            :  * request.
    2320                 :            :  *
    2321                 :            :  * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
    2322                 :            :  * Because:
    2323                 :            :  * a) The caller may be trying to free *extra* pages to satisfy a higher-order
    2324                 :            :  *    allocation or
    2325                 :            :  * b) The target zone may be at high_wmark_pages(zone) but the lower zones
    2326                 :            :  *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
    2327                 :            :  *    zone defense algorithm.
    2328                 :            :  *
    2329                 :            :  * If a zone is deemed to be full of pinned pages then just give it a light
    2330                 :            :  * scan then give up on it.
    2331                 :            :  *
    2332                 :            :  * This function returns true if a zone is being reclaimed for a costly
    2333                 :            :  * high-order allocation and compaction is ready to begin. This indicates to
    2334                 :            :  * the caller that it should consider retrying the allocation instead of
    2335                 :            :  * further reclaim.
    2336                 :            :  */
    2337                 :          0 : static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
    2338                 :            : {
    2339                 :            :         struct zoneref *z;
    2340                 :            :         struct zone *zone;
    2341                 :            :         unsigned long nr_soft_reclaimed;
    2342                 :            :         unsigned long nr_soft_scanned;
    2343                 :            :         bool aborted_reclaim = false;
    2344                 :            : 
    2345                 :            :         /*
    2346                 :            :          * If the number of buffer_heads in the machine exceeds the maximum
    2347                 :            :          * allowed level, force direct reclaim to scan the highmem zone as
    2348                 :            :          * highmem pages could be pinning lowmem pages storing buffer_heads
    2349                 :            :          */
    2350         [ -  + ]:      76089 :         if (buffer_heads_over_limit)
    2351                 :          0 :                 sc->gfp_mask |= __GFP_HIGHMEM;
    2352                 :            : 
    2353         [ +  + ]:     152193 :         for_each_zone_zonelist_nodemask(zone, z, zonelist,
    2354                 :            :                                         gfp_zone(sc->gfp_mask), sc->nodemask) {
    2355            [ + ]:     152191 :                 if (!populated_zone(zone))
    2356                 :          0 :                         continue;
    2357                 :            :                 /*
    2358                 :            :                  * Take care memory controller reclaiming has small influence
    2359                 :            :                  * to global LRU.
    2360                 :            :                  */
    2361                 :            :                 if (global_reclaim(sc)) {
    2362                 :            :                         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    2363                 :            :                                 continue;
    2364   [ +  +  +  + ]:     368780 :                         if (sc->priority != DEF_PRIORITY &&
    2365                 :            :                             !zone_reclaimable(zone))
    2366                 :      40581 :                                 continue;       /* Let kswapd poll it */
    2367                 :            :                         if (IS_ENABLED(CONFIG_COMPACTION)) {
    2368                 :            :                                 /*
    2369                 :            :                                  * If we already have plenty of memory free for
    2370                 :            :                                  * compaction in this zone, don't free any more.
    2371                 :            :                                  * Even though compaction is invoked for any
    2372                 :            :                                  * non-zero order, only frequent costly order
    2373                 :            :                                  * reclamation is disruptive enough to become a
    2374                 :            :                                  * noticeable problem, like transparent huge
    2375                 :            :                                  * page allocations.
    2376                 :            :                                  */
    2377         [ -  + ]:     111622 :                                 if (compaction_ready(zone, sc)) {
    2378                 :            :                                         aborted_reclaim = true;
    2379                 :          0 :                                         continue;
    2380                 :            :                                 }
    2381                 :            :                         }
    2382                 :            :                         /*
    2383                 :            :                          * This steals pages from memory cgroups over softlimit
    2384                 :            :                          * and returns the number of reclaimed pages and
    2385                 :            :                          * scanned pages. This works for global memory pressure
    2386                 :            :                          * and balancing, not for a memcg's limit.
    2387                 :            :                          */
    2388                 :            :                         nr_soft_scanned = 0;
    2389                 :            :                         nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
    2390                 :            :                                                 sc->order, sc->gfp_mask,
    2391                 :            :                                                 &nr_soft_scanned);
    2392                 :            :                         sc->nr_reclaimed += nr_soft_reclaimed;
    2393                 :            :                         sc->nr_scanned += nr_soft_scanned;
    2394                 :            :                         /* need some check for avoid more shrink_zone() */
    2395                 :            :                 }
    2396                 :            : 
    2397                 :     111622 :                 shrink_zone(zone, sc);
    2398                 :            :         }
    2399                 :            : 
    2400                 :      76093 :         return aborted_reclaim;
    2401                 :            : }
    2402                 :            : 
    2403                 :            : /* All zones in zonelist are unreclaimable? */
    2404                 :       5733 : static bool all_unreclaimable(struct zonelist *zonelist,
    2405                 :            :                 struct scan_control *sc)
    2406                 :            : {
    2407                 :            :         struct zoneref *z;
    2408                 :            :         struct zone *zone;
    2409                 :            : 
    2410         [ +  + ]:       7726 :         for_each_zone_zonelist_nodemask(zone, z, zonelist,
    2411                 :            :                         gfp_zone(sc->gfp_mask), sc->nodemask) {
    2412         [ -  + ]:       6817 :                 if (!populated_zone(zone))
    2413                 :          0 :                         continue;
    2414                 :            :                 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    2415                 :            :                         continue;
    2416         [ +  + ]:       6817 :                 if (zone_reclaimable(zone))
    2417                 :            :                         return false;
    2418                 :            :         }
    2419                 :            : 
    2420                 :            :         return true;
    2421                 :            : }
    2422                 :            : 
    2423                 :            : /*
    2424                 :            :  * This is the main entry point to direct page reclaim.
    2425                 :            :  *
    2426                 :            :  * If a full scan of the inactive list fails to free enough memory then we
    2427                 :            :  * are "out of memory" and something needs to be killed.
    2428                 :            :  *
    2429                 :            :  * If the caller is !__GFP_FS then the probability of a failure is reasonably
    2430                 :            :  * high - the zone may be full of dirty or under-writeback pages, which this
    2431                 :            :  * caller can't do much about.  We kick the writeback threads and take explicit
    2432                 :            :  * naps in the hope that some of these pages can be written.  But if the
    2433                 :            :  * allocating task holds filesystem locks which prevent writeout this might not
    2434                 :            :  * work, and the allocation attempt will fail.
    2435                 :            :  *
    2436                 :            :  * returns:     0, if no pages reclaimed
    2437                 :            :  *              else, the number of pages reclaimed
    2438                 :            :  */
    2439                 :          0 : static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
    2440                 :            :                                         struct scan_control *sc,
    2441                 :            :                                         struct shrink_control *shrink)
    2442                 :            : {
    2443                 :            :         unsigned long total_scanned = 0;
    2444                 :       5856 :         struct reclaim_state *reclaim_state = current->reclaim_state;
    2445                 :            :         struct zoneref *z;
    2446                 :            :         struct zone *zone;
    2447                 :            :         unsigned long writeback_threshold;
    2448                 :            :         bool aborted_reclaim;
    2449                 :            : 
    2450                 :            :         delayacct_freepages_start();
    2451                 :            : 
    2452                 :            :         if (global_reclaim(sc))
    2453                 :            :                 count_vm_event(ALLOCSTALL);
    2454                 :            : 
    2455                 :            :         do {
    2456                 :            :                 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
    2457                 :            :                                 sc->priority);
    2458                 :      76097 :                 sc->nr_scanned = 0;
    2459                 :      76097 :                 aborted_reclaim = shrink_zones(zonelist, sc);
    2460                 :            : 
    2461                 :            :                 /*
    2462                 :            :                  * Don't shrink slabs when reclaiming memory from over limit
    2463                 :            :                  * cgroups but do shrink slab at least once when aborting
    2464                 :            :                  * reclaim for compaction to avoid unevenly scanning file/anon
    2465                 :            :                  * LRU pages over slab pages.
    2466                 :            :                  */
    2467                 :            :                 if (global_reclaim(sc)) {
    2468                 :            :                         unsigned long lru_pages = 0;
    2469                 :            : 
    2470                 :            :                         nodes_clear(shrink->nodes_to_scan);
    2471         [ +  + ]:     228313 :                         for_each_zone_zonelist(zone, z, zonelist,
    2472                 :            :                                         gfp_zone(sc->gfp_mask)) {
    2473                 :            :                                 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    2474                 :            :                                         continue;
    2475                 :            : 
    2476                 :     152202 :                                 lru_pages += zone_reclaimable_pages(zone);
    2477                 :            :                                 node_set(zone_to_nid(zone),
    2478                 :            :                                          shrink->nodes_to_scan);
    2479                 :            :                         }
    2480                 :            : 
    2481                 :      76106 :                         shrink_slab(shrink, sc->nr_scanned, lru_pages);
    2482         [ +  - ]:      76106 :                         if (reclaim_state) {
    2483                 :      76106 :                                 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
    2484                 :      76106 :                                 reclaim_state->reclaimed_slab = 0;
    2485                 :            :                         }
    2486                 :            :                 }
    2487                 :      76106 :                 total_scanned += sc->nr_scanned;
    2488         [ +  + ]:      76106 :                 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
    2489                 :            :                         goto out;
    2490                 :            : 
    2491                 :            :                 /*
    2492                 :            :                  * If we're having trouble reclaiming, start doing
    2493                 :            :                  * writepage even in laptop mode.
    2494                 :            :                  */
    2495         [ +  + ]:      76081 :                 if (sc->priority < DEF_PRIORITY - 2)
    2496                 :      58520 :                         sc->may_writepage = 1;
    2497                 :            : 
    2498                 :            :                 /*
    2499                 :            :                  * Try to write back as many pages as we just scanned.  This
    2500                 :            :                  * tends to cause slow streaming writers to write data to the
    2501                 :            :                  * disk smoothly, at the dirtying rate, which is nice.   But
    2502                 :            :                  * that's undesirable in laptop mode, where we *want* lumpy
    2503                 :            :                  * writeout.  So in laptop mode, write out the whole world.
    2504                 :            :                  */
    2505                 :      76081 :                 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
    2506         [ +  + ]:      76081 :                 if (total_scanned > writeback_threshold) {
    2507         [ +  - ]:         21 :                         wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
    2508                 :            :                                                 WB_REASON_TRY_TO_FREE_PAGES);
    2509                 :         21 :                         sc->may_writepage = 1;
    2510                 :            :                 }
    2511 [ +  + ][ +  - ]:      76081 :         } while (--sc->priority >= 0 && !aborted_reclaim);
    2512                 :            : 
    2513                 :            : out:
    2514                 :            :         delayacct_freepages_end();
    2515                 :            : 
    2516            [ + ]:       5864 :         if (sc->nr_reclaimed)
    2517                 :            :                 return sc->nr_reclaimed;
    2518                 :            : 
    2519                 :            :         /*
    2520                 :            :          * As hibernation is going on, kswapd is frozen so that it can't mark
    2521                 :            :          * the zone as all_unreclaimable. Thus bypass the all_unreclaimable
    2522                 :            :          * check.
    2523                 :            :          */
    2524         [ +  - ]:       5733 :         if (oom_killer_disabled)
    2525                 :            :                 return 0;
    2526                 :            : 
    2527                 :            :         /* Aborted reclaim to try compaction? don't OOM, then */
    2528         [ +  - ]:       5733 :         if (aborted_reclaim)
    2529                 :            :                 return 1;
    2530                 :            : 
    2531                 :            :         /* top priority shrink_zones still had more to do? don't OOM, then */
    2532         [ +  + ]:       5733 :         if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
    2533                 :            :                 return 1;
    2534                 :            : 
    2535                 :            :         return 0;
    2536                 :            : }
    2537                 :            : 
    2538                 :          0 : static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
    2539                 :            : {
    2540                 :            :         struct zone *zone;
    2541                 :            :         unsigned long pfmemalloc_reserve = 0;
    2542                 :            :         unsigned long free_pages = 0;
    2543                 :            :         int i;
    2544                 :            :         bool wmark_ok;
    2545                 :            : 
    2546         [ +  + ]:      11698 :         for (i = 0; i <= ZONE_NORMAL; i++) {
    2547                 :       5849 :                 zone = &pgdat->node_zones[i];
    2548                 :       5849 :                 pfmemalloc_reserve += min_wmark_pages(zone);
    2549                 :       5849 :                 free_pages += zone_page_state(zone, NR_FREE_PAGES);
    2550                 :            :         }
    2551                 :            : 
    2552                 :       5849 :         wmark_ok = free_pages > pfmemalloc_reserve / 2;
    2553                 :            : 
    2554                 :            :         /* kswapd must be awake if processes are being throttled */
    2555 [ -  + ][ #  # ]:       5849 :         if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
    2556                 :          0 :                 pgdat->classzone_idx = min(pgdat->classzone_idx,
    2557                 :            :                                                 (enum zone_type)ZONE_NORMAL);
    2558                 :          0 :                 wake_up_interruptible(&pgdat->kswapd_wait);
    2559                 :            :         }
    2560                 :            : 
    2561                 :          0 :         return wmark_ok;
    2562                 :            : }
    2563                 :            : 
    2564                 :            : /*
    2565                 :            :  * Throttle direct reclaimers if backing storage is backed by the network
    2566                 :            :  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
    2567                 :            :  * depleted. kswapd will continue to make progress and wake the processes
    2568                 :            :  * when the low watermark is reached.
    2569                 :            :  *
    2570                 :            :  * Returns true if a fatal signal was delivered during throttling. If this
    2571                 :            :  * happens, the page allocator should not consider triggering the OOM killer.
    2572                 :            :  */
    2573                 :          0 : static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
    2574                 :            :                                         nodemask_t *nodemask)
    2575                 :            : {
    2576                 :            :         struct zone *zone;
    2577                 :            :         int high_zoneidx = gfp_zone(gfp_mask);
    2578                 :            :         pg_data_t *pgdat;
    2579                 :            : 
    2580                 :            :         /*
    2581                 :            :          * Kernel threads should not be throttled as they may be indirectly
    2582                 :            :          * responsible for cleaning pages necessary for reclaim to make forward
    2583                 :            :          * progress. kjournald for example may enter direct reclaim while
    2584                 :            :          * committing a transaction where throttling it could force other
    2585                 :            :          * processes to block on log_wait_commit().
    2586                 :            :          */
    2587         [ +  - ]:       5856 :         if (current->flags & PF_KTHREAD)
    2588                 :            :                 goto out;
    2589                 :            : 
    2590                 :            :         /*
    2591                 :            :          * If a fatal signal is pending, this process should not throttle.
    2592                 :            :          * It should return quickly so it can exit and free its memory
    2593                 :            :          */
    2594         [ +  + ]:       5856 :         if (fatal_signal_pending(current))
    2595                 :            :                 goto out;
    2596                 :            : 
    2597                 :            :         /* Check if the pfmemalloc reserves are ok */
    2598                 :            :         first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
    2599                 :       5847 :         pgdat = zone->zone_pgdat;
    2600         [ -  + ]:       5847 :         if (pfmemalloc_watermark_ok(pgdat))
    2601                 :            :                 goto out;
    2602                 :            : 
    2603                 :            :         /* Account for the throttling */
    2604                 :            :         count_vm_event(PGSCAN_DIRECT_THROTTLE);
    2605                 :            : 
    2606                 :            :         /*
    2607                 :            :          * If the caller cannot enter the filesystem, it's possible that it
    2608                 :            :          * is due to the caller holding an FS lock or performing a journal
    2609                 :            :          * transaction in the case of a filesystem like ext[3|4]. In this case,
    2610                 :            :          * it is not safe to block on pfmemalloc_wait as kswapd could be
    2611                 :            :          * blocked waiting on the same lock. Instead, throttle for up to a
    2612                 :            :          * second before continuing.
    2613                 :            :          */
    2614         [ #  # ]:          0 :         if (!(gfp_mask & __GFP_FS)) {
    2615 [ #  # ][ #  # ]:          0 :                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
         [ #  # ][ #  # ]
    2616                 :            :                         pfmemalloc_watermark_ok(pgdat), HZ);
    2617                 :            : 
    2618                 :            :                 goto check_pending;
    2619                 :            :         }
    2620                 :            : 
    2621                 :            :         /* Throttle until kswapd wakes the process */
    2622 [ #  # ][ #  # ]:          0 :         wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
                 [ #  # ]
    2623                 :            :                 pfmemalloc_watermark_ok(pgdat));
    2624                 :            : 
    2625                 :            : check_pending:
    2626         [ #  # ]:          0 :         if (fatal_signal_pending(current))
    2627                 :            :                 return true;
    2628                 :            : 
    2629                 :            : out:
    2630                 :            :         return false;
    2631                 :            : }
    2632                 :            : 
    2633                 :          0 : unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
    2634                 :            :                                 gfp_t gfp_mask, nodemask_t *nodemask)
    2635                 :            : {
    2636                 :            :         unsigned long nr_reclaimed;
    2637                 :      17547 :         struct scan_control sc = {
    2638                 :            :                 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
    2639                 :       5849 :                 .may_writepage = !laptop_mode,
    2640                 :            :                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
    2641                 :            :                 .may_unmap = 1,
    2642                 :            :                 .may_swap = 1,
    2643                 :            :                 .order = order,
    2644                 :            :                 .priority = DEF_PRIORITY,
    2645                 :            :                 .target_mem_cgroup = NULL,
    2646                 :            :                 .nodemask = nodemask,
    2647                 :            :         };
    2648                 :       5849 :         struct shrink_control shrink = {
    2649                 :            :                 .gfp_mask = sc.gfp_mask,
    2650                 :            :         };
    2651                 :            : 
    2652                 :            :         /*
    2653                 :            :          * Do not enter reclaim if fatal signal was delivered while throttled.
    2654                 :            :          * 1 is returned so that the page allocator does not OOM kill at this
    2655                 :            :          * point.
    2656                 :            :          */
    2657         [ +  + ]:       5849 :         if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
    2658                 :            :                 return 1;
    2659                 :            : 
    2660                 :       5850 :         trace_mm_vmscan_direct_reclaim_begin(order,
    2661                 :            :                                 sc.may_writepage,
    2662                 :            :                                 gfp_mask);
    2663                 :            : 
    2664                 :       5850 :         nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    2665                 :            : 
    2666                 :            :         trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
    2667                 :            : 
    2668                 :       5855 :         return nr_reclaimed;
    2669                 :            : }
    2670                 :            : 
    2671                 :            : #ifdef CONFIG_MEMCG
    2672                 :            : 
    2673                 :            : unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
    2674                 :            :                                                 gfp_t gfp_mask, bool noswap,
    2675                 :            :                                                 struct zone *zone,
    2676                 :            :                                                 unsigned long *nr_scanned)
    2677                 :            : {
    2678                 :            :         struct scan_control sc = {
    2679                 :            :                 .nr_scanned = 0,
    2680                 :            :                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
    2681                 :            :                 .may_writepage = !laptop_mode,
    2682                 :            :                 .may_unmap = 1,
    2683                 :            :                 .may_swap = !noswap,
    2684                 :            :                 .order = 0,
    2685                 :            :                 .priority = 0,
    2686                 :            :                 .target_mem_cgroup = memcg,
    2687                 :            :         };
    2688                 :            :         struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    2689                 :            : 
    2690                 :            :         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    2691                 :            :                         (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
    2692                 :            : 
    2693                 :            :         trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
    2694                 :            :                                                       sc.may_writepage,
    2695                 :            :                                                       sc.gfp_mask);
    2696                 :            : 
    2697                 :            :         /*
    2698                 :            :          * NOTE: Although we can get the priority field, using it
    2699                 :            :          * here is not a good idea, since it limits the pages we can scan.
    2700                 :            :          * If we don't reclaim here, the shrink_zone from balance_pgdat
    2701                 :            :          * will pick up pages from other mem cgroups as well. We hack
    2702                 :            :          * the priority and make it zero.
    2703                 :            :          */
    2704                 :            :         shrink_lruvec(lruvec, &sc);
    2705                 :            : 
    2706                 :            :         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
    2707                 :            : 
    2708                 :            :         *nr_scanned = sc.nr_scanned;
    2709                 :            :         return sc.nr_reclaimed;
    2710                 :            : }
    2711                 :            : 
    2712                 :            : unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
    2713                 :            :                                            gfp_t gfp_mask,
    2714                 :            :                                            bool noswap)
    2715                 :            : {
    2716                 :            :         struct zonelist *zonelist;
    2717                 :            :         unsigned long nr_reclaimed;
    2718                 :            :         int nid;
    2719                 :            :         struct scan_control sc = {
    2720                 :            :                 .may_writepage = !laptop_mode,
    2721                 :            :                 .may_unmap = 1,
    2722                 :            :                 .may_swap = !noswap,
    2723                 :            :                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
    2724                 :            :                 .order = 0,
    2725                 :            :                 .priority = DEF_PRIORITY,
    2726                 :            :                 .target_mem_cgroup = memcg,
    2727                 :            :                 .nodemask = NULL, /* we don't care the placement */
    2728                 :            :                 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    2729                 :            :                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
    2730                 :            :         };
    2731                 :            :         struct shrink_control shrink = {
    2732                 :            :                 .gfp_mask = sc.gfp_mask,
    2733                 :            :         };
    2734                 :            : 
    2735                 :            :         /*
    2736                 :            :          * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
    2737                 :            :          * care where we get pages from. So the node where we start the
    2738                 :            :          * scan does not need to be the current node.
    2739                 :            :          */
    2740                 :            :         nid = mem_cgroup_select_victim_node(memcg);
    2741                 :            : 
    2742                 :            :         zonelist = NODE_DATA(nid)->node_zonelists;
    2743                 :            : 
    2744                 :            :         trace_mm_vmscan_memcg_reclaim_begin(0,
    2745                 :            :                                             sc.may_writepage,
    2746                 :            :                                             sc.gfp_mask);
    2747                 :            : 
    2748                 :            :         nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    2749                 :            : 
    2750                 :            :         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
    2751                 :            : 
    2752                 :            :         return nr_reclaimed;
    2753                 :            : }
    2754                 :            : #endif
    2755                 :            : 
    2756                 :          0 : static void age_active_anon(struct zone *zone, struct scan_control *sc)
    2757                 :            : {
    2758                 :            :         struct mem_cgroup *memcg;
    2759                 :            : 
    2760         [ -  + ]:      31014 :         if (!total_swap_pages)
    2761                 :          0 :                 return;
    2762                 :            : 
    2763                 :            :         memcg = mem_cgroup_iter(NULL, NULL, NULL);
    2764                 :            :         do {
    2765                 :          0 :                 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    2766                 :            : 
    2767            [ - ]:          0 :                 if (inactive_anon_is_low(lruvec))
    2768                 :          0 :                         shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
    2769                 :            :                                            sc, LRU_ACTIVE_ANON);
    2770                 :            : 
    2771                 :            :                 memcg = mem_cgroup_iter(NULL, memcg, NULL);
    2772                 :            :         } while (memcg);
    2773                 :            : }
    2774                 :            : 
    2775                 :          0 : static bool zone_balanced(struct zone *zone, int order,
    2776                 :            :                           unsigned long balance_gap, int classzone_idx)
    2777                 :            : {
    2778         [ +  + ]:     211223 :         if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
    2779                 :            :                                     balance_gap, classzone_idx, 0))
    2780                 :            :                 return false;
    2781                 :            : 
    2782   [ +  +  +  - ]:      53189 :         if (IS_ENABLED(CONFIG_COMPACTION) && order &&
    2783                 :       2788 :             !compaction_suitable(zone, order))
    2784                 :            :                 return false;
    2785                 :            : 
    2786                 :            :         return true;
    2787                 :            : }
    2788                 :            : 
    2789                 :            : /*
    2790                 :            :  * pgdat_balanced() is used when checking if a node is balanced.
    2791                 :            :  *
    2792                 :            :  * For order-0, all zones must be balanced!
    2793                 :            :  *
    2794                 :            :  * For high-order allocations only zones that meet watermarks and are in a
    2795                 :            :  * zone allowed by the caller's classzone_idx are added to balanced_pages. The
    2796                 :            :  * total of balanced pages must be at least 25% of the zones allowed by
    2797                 :            :  * classzone_idx for the node to be considered balanced. Forcing all zones to
    2798                 :            :  * be balanced for high orders can cause excessive reclaim when there are
    2799                 :            :  * imbalanced zones.
    2800                 :            :  * The choice of 25% is due to
    2801                 :            :  *   o a 16M DMA zone that is balanced will not balance a zone on any
    2802                 :            :  *     reasonable sized machine
    2803                 :            :  *   o On all other machines, the top zone must be at least a reasonable
    2804                 :            :  *     percentage of the middle zones. For example, on 32-bit x86, highmem
    2805                 :            :  *     would need to be at least 256M for it to balance a whole node.
    2806                 :            :  *     Similarly, on x86-64 the Normal zone would need to be at least 1G
    2807                 :            :  *     to balance a node on its own. These seemed like reasonable ratios.
    2808                 :            :  */
    2809                 :          0 : static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
    2810                 :            : {
    2811                 :            :         unsigned long managed_pages = 0;
    2812                 :            :         unsigned long balanced_pages = 0;
    2813                 :            :         int i;
    2814                 :            : 
    2815                 :            :         /* Check the watermark levels */
    2816         [ +  + ]:      71555 :         for (i = 0; i <= classzone_idx; i++) {
    2817                 :      69251 :                 struct zone *zone = pgdat->node_zones + i;
    2818                 :            : 
    2819         [ -  + ]:      69251 :                 if (!populated_zone(zone))
    2820                 :          0 :                         continue;
    2821                 :            : 
    2822                 :      69251 :                 managed_pages += zone->managed_pages;
    2823                 :            : 
    2824                 :            :                 /*
    2825                 :            :                  * A special case here:
    2826                 :            :                  *
    2827                 :            :                  * balance_pgdat() skips over all_unreclaimable after
    2828                 :            :                  * DEF_PRIORITY. Effectively, it considers them balanced so
    2829                 :            :                  * they must be considered balanced here as well!
    2830                 :            :                  */
    2831         [ +  + ]:      69251 :                 if (!zone_reclaimable(zone)) {
    2832                 :      14906 :                         balanced_pages += zone->managed_pages;
    2833                 :      14906 :                         continue;
    2834                 :            :                 }
    2835                 :            : 
    2836         [ +  + ]:      54345 :                 if (zone_balanced(zone, order, 0, i))
    2837                 :      22022 :                         balanced_pages += zone->managed_pages;
    2838         [ +  + ]:      32323 :                 else if (!order)
    2839                 :            :                         return false;
    2840                 :            :         }
    2841                 :            : 
    2842         [ +  + ]:       2304 :         if (order)
    2843                 :          2 :                 return balanced_pages >= (managed_pages >> 2);
    2844                 :            :         else
    2845                 :            :                 return true;
    2846                 :            : }
    2847                 :            : 
    2848                 :            : /*
    2849                 :            :  * Prepare kswapd for sleeping. This verifies that there are no processes
    2850                 :            :  * waiting in throttle_direct_reclaim() and that watermarks have been met.
    2851                 :            :  *
    2852                 :            :  * Returns true if kswapd is ready to sleep
    2853                 :            :  */
    2854                 :          0 : static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
    2855                 :            :                                         int classzone_idx)
    2856                 :            : {
    2857                 :            :         /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
    2858         [ +  + ]:       7212 :         if (remaining)
    2859                 :            :                 return false;
    2860                 :            : 
    2861                 :            :         /*
    2862                 :            :          * There is a potential race between when kswapd checks its watermarks
    2863                 :            :          * and a process gets throttled. There is also a potential race if
    2864                 :            :          * processes get throttled, kswapd wakes, a large process exits thereby
    2865                 :            :          * balancing the zones that causes kswapd to miss a wakeup. If kswapd
    2866                 :            :          * is going to sleep, no process should be sleeping on pfmemalloc_wait
    2867                 :            :          * so wake them now if necessary. If necessary, processes will wake
    2868                 :            :          * kswapd and get throttled again
    2869                 :            :          */
    2870         [ -  + ]:       6073 :         if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
    2871                 :          0 :                 wake_up(&pgdat->pfmemalloc_wait);
    2872                 :          0 :                 return false;
    2873                 :            :         }
    2874                 :            : 
    2875                 :       6073 :         return pgdat_balanced(pgdat, order, classzone_idx);
    2876                 :            : }
    2877                 :            : 
    2878                 :            : /*
    2879                 :            :  * kswapd shrinks the zone by the number of pages required to reach
    2880                 :            :  * the high watermark.
    2881                 :            :  *
    2882                 :            :  * Returns true if kswapd scanned at least the requested number of pages to
    2883                 :            :  * reclaim or if the lack of progress was due to pages under writeback.
    2884                 :            :  * This is used to determine if the scanning priority needs to be raised.
    2885                 :            :  */
    2886                 :          0 : static bool kswapd_shrink_zone(struct zone *zone,
    2887                 :            :                                int classzone_idx,
    2888                 :            :                                struct scan_control *sc,
    2889                 :            :                                unsigned long lru_pages,
    2890                 :            :                                unsigned long *nr_attempted)
    2891                 :            : {
    2892                 :      52574 :         int testorder = sc->order;
    2893                 :            :         unsigned long balance_gap;
    2894                 :      52574 :         struct reclaim_state *reclaim_state = current->reclaim_state;
    2895                 :     105148 :         struct shrink_control shrink = {
    2896                 :      52574 :                 .gfp_mask = sc->gfp_mask,
    2897                 :            :         };
    2898                 :            :         bool lowmem_pressure;
    2899                 :            : 
    2900                 :            :         /* Reclaim above the high watermark. */
    2901                 :      52574 :         sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
    2902                 :            : 
    2903                 :            :         /*
    2904                 :            :          * Kswapd reclaims only single pages with compaction enabled. Trying
    2905                 :            :          * too hard to reclaim until contiguous free pages have become
    2906                 :            :          * available can hurt performance by evicting too much useful data
    2907                 :            :          * from memory. Do not reclaim more than needed for compaction.
    2908                 :            :          */
    2909   [ +  +  +  - ]:      52576 :         if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
    2910                 :          2 :                         compaction_suitable(zone, sc->order) !=
    2911                 :            :                                 COMPACT_SKIPPED)
    2912                 :            :                 testorder = 0;
    2913                 :            : 
    2914                 :            :         /*
    2915                 :            :          * We put equal pressure on every zone, unless one zone has way too
    2916                 :            :          * many pages free already. The "too many pages" is defined as the
    2917                 :            :          * high wmark plus a "gap" where the gap is either the low
    2918                 :            :          * watermark or 1% of the zone, whichever is smaller.
    2919                 :            :          */
    2920                 :      52574 :         balance_gap = min(low_wmark_pages(zone),
    2921                 :            :                 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
    2922                 :            :                 KSWAPD_ZONE_BALANCE_GAP_RATIO);
    2923                 :            : 
    2924                 :            :         /*
    2925                 :            :          * If there is no low memory pressure and the zone is balanced then no
    2926                 :            :          * reclaim is necessary
    2927                 :            :          */
    2928 [ -  + ][ #  # ]:      52574 :         lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
    2929 [ +  - ][ +  + ]:      52574 :         if (!lowmem_pressure && zone_balanced(zone, testorder,
    2930                 :            :                                                 balance_gap, classzone_idx))
    2931                 :            :                 return true;
    2932                 :            : 
    2933                 :      49684 :         shrink_zone(zone, sc);
    2934                 :            :         nodes_clear(shrink.nodes_to_scan);
    2935                 :            :         node_set(zone_to_nid(zone), shrink.nodes_to_scan);
    2936                 :            : 
    2937                 :      49684 :         reclaim_state->reclaimed_slab = 0;
    2938                 :      49684 :         shrink_slab(&shrink, sc->nr_scanned, lru_pages);
    2939                 :      49684 :         sc->nr_reclaimed += reclaim_state->reclaimed_slab;
    2940                 :            : 
    2941                 :            :         /* Account for the number of pages attempted to reclaim */
    2942                 :      49684 :         *nr_attempted += sc->nr_to_reclaim;
    2943                 :            : 
    2944                 :            :         zone_clear_flag(zone, ZONE_WRITEBACK);
    2945                 :            : 
    2946                 :            :         /*
    2947                 :            :          * If a zone reaches its high watermark, consider it to be no longer
    2948                 :            :          * congested. It's possible there are dirty pages backed by congested
    2949                 :            :          * BDIs but as pressure is relieved, speculatively avoid congestion
    2950                 :            :          * waits.
    2951                 :            :          */
    2952   [ +  +  +  + ]:      96641 :         if (zone_reclaimable(zone) &&
    2953                 :      46957 :             zone_balanced(zone, testorder, 0, classzone_idx)) {
    2954                 :            :                 zone_clear_flag(zone, ZONE_CONGESTED);
    2955                 :            :                 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
    2956                 :            :         }
    2957                 :            : 
    2958                 :      49684 :         return sc->nr_scanned >= sc->nr_to_reclaim;
    2959                 :            : }
    2960                 :            : 
    2961                 :            : /*
    2962                 :            :  * For kswapd, balance_pgdat() will work across all this node's zones until
    2963                 :            :  * they are all at high_wmark_pages(zone).
    2964                 :            :  *
    2965                 :            :  * Returns the final order kswapd was reclaiming at
    2966                 :            :  *
    2967                 :            :  * There is special handling here for zones which are full of pinned pages.
    2968                 :            :  * This can happen if the pages are all mlocked, or if they are all used by
    2969                 :            :  * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
    2970                 :            :  * What we do is to detect the case where all pages in the zone have been
    2971                 :            :  * scanned twice and there has been zero successful reclaim.  Mark the zone as
    2972                 :            :  * dead and from now on, only perform a short scan.  Basically we're polling
    2973                 :            :  * the zone for when the problem goes away.
    2974                 :            :  *
    2975                 :            :  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
    2976                 :            :  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
    2977                 :            :  * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
    2978                 :            :  * lower zones regardless of the number of free pages in the lower zones. This
    2979                 :            :  * interoperates with the page allocator fallback scheme to ensure that aging
    2980                 :            :  * of pages is balanced across the zones.
    2981                 :            :  */
    2982                 :          0 : static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
    2983                 :            :                                                         int *classzone_idx)
    2984                 :            : {
    2985                 :            :         int i;
    2986                 :            :         int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
    2987                 :            :         unsigned long nr_soft_reclaimed;
    2988                 :            :         unsigned long nr_soft_scanned;
    2989                 :       7212 :         struct scan_control sc = {
    2990                 :            :                 .gfp_mask = GFP_KERNEL,
    2991                 :            :                 .priority = DEF_PRIORITY,
    2992                 :            :                 .may_unmap = 1,
    2993                 :            :                 .may_swap = 1,
    2994                 :       3606 :                 .may_writepage = !laptop_mode,
    2995                 :            :                 .order = order,
    2996                 :            :                 .target_mem_cgroup = NULL,
    2997                 :            :         };
    2998                 :            :         count_vm_event(PAGEOUTRUN);
    2999                 :            : 
    3000                 :            :         do {
    3001                 :            :                 unsigned long lru_pages = 0;
    3002                 :      31014 :                 unsigned long nr_attempted = 0;
    3003                 :            :                 bool raise_priority = true;
    3004                 :      31014 :                 bool pgdat_needs_compaction = (order > 0);
    3005                 :            : 
    3006                 :      31014 :                 sc.nr_reclaimed = 0;
    3007                 :            : 
    3008                 :            :                 /*
    3009                 :            :                  * Scan in the highmem->dma direction for the highest
    3010                 :            :                  * zone which needs scanning
    3011                 :            :                  */
    3012         [ +  - ]:      31014 :                 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
    3013                 :      31014 :                         struct zone *zone = pgdat->node_zones + i;
    3014                 :            : 
    3015         [ -  + ]:      31014 :                         if (!populated_zone(zone))
    3016                 :          0 :                                 continue;
    3017                 :            : 
    3018   [ +  +  -  + ]:      58419 :                         if (sc.priority != DEF_PRIORITY &&
    3019                 :            :                             !zone_reclaimable(zone))
    3020                 :          0 :                                 continue;
    3021                 :            : 
    3022                 :            :                         /*
    3023                 :            :                          * Do some background aging of the anon list, to give
    3024                 :            :                          * pages a chance to be referenced before reclaiming.
    3025                 :            :                          */
    3026                 :      31014 :                         age_active_anon(zone, &sc);
    3027                 :            : 
    3028                 :            :                         /*
    3029                 :            :                          * If the number of buffer_heads in the machine
    3030                 :            :                          * exceeds the maximum allowed level and this node
    3031                 :            :                          * has a highmem zone, force kswapd to reclaim from
    3032                 :            :                          * it to relieve lowmem pressure.
    3033                 :            :                          */
    3034 [ -  + ][ #  # ]:      31014 :                         if (buffer_heads_over_limit && is_highmem_idx(i)) {
    3035                 :            :                                 end_zone = i;
    3036                 :            :                                 break;
    3037                 :            :                         }
    3038                 :            : 
    3039         [ -  + ]:      31014 :                         if (!zone_balanced(zone, order, 0, 0)) {
    3040                 :            :                                 end_zone = i;
    3041                 :            :                                 break;
    3042                 :            :                         } else {
    3043                 :            :                                 /*
    3044                 :            :                                  * If balanced, clear the dirty and congested
    3045                 :            :                                  * flags
    3046                 :            :                                  */
    3047                 :            :                                 zone_clear_flag(zone, ZONE_CONGESTED);
    3048                 :            :                                 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
    3049                 :            :                         }
    3050                 :            :                 }
    3051                 :            : 
    3052         [ +  - ]:      31014 :                 if (i < 0)
    3053                 :            :                         goto out;
    3054                 :            : 
    3055         [ +  + ]:      93042 :                 for (i = 0; i <= end_zone; i++) {
    3056                 :      62028 :                         struct zone *zone = pgdat->node_zones + i;
    3057                 :            : 
    3058         [ -  + ]:      62028 :                         if (!populated_zone(zone))
    3059                 :          0 :                                 continue;
    3060                 :            : 
    3061                 :      62028 :                         lru_pages += zone_reclaimable_pages(zone);
    3062                 :            : 
    3063                 :            :                         /*
    3064                 :            :                          * If any zone is currently balanced then kswapd will
    3065                 :            :                          * not call compaction as it is expected that the
    3066                 :            :                          * necessary pages are already available.
    3067                 :            :                          */
    3068   [ +  +  +  - ]:      62029 :                         if (pgdat_needs_compaction &&
    3069                 :          1 :                                         zone_watermark_ok(zone, order,
    3070                 :            :                                                 low_wmark_pages(zone),
    3071                 :            :                                                 *classzone_idx, 0))
    3072                 :            :                                 pgdat_needs_compaction = false;
    3073                 :            :                 }
    3074                 :            : 
    3075                 :            :                 /*
    3076                 :            :                  * If we're getting trouble reclaiming, start doing writepage
    3077                 :            :                  * even in laptop mode.
    3078                 :            :                  */
    3079         [ +  + ]:      31014 :                 if (sc.priority < DEF_PRIORITY - 2)
    3080                 :      31014 :                         sc.may_writepage = 1;
    3081                 :            : 
    3082                 :            :                 /*
    3083                 :            :                  * Now scan the zone in the dma->highmem direction, stopping
    3084                 :            :                  * at the last zone which needs scanning.
    3085                 :            :                  *
    3086                 :            :                  * We do this because the page allocator works in the opposite
    3087                 :            :                  * direction.  This prevents the page allocator from allocating
    3088                 :            :                  * pages behind kswapd's direction of progress, which would
    3089                 :            :                  * cause too much scanning of the lower zones.
    3090                 :            :                  */
    3091         [ +  + ]:      93042 :                 for (i = 0; i <= end_zone; i++) {
    3092                 :      62028 :                         struct zone *zone = pgdat->node_zones + i;
    3093                 :            : 
    3094         [ -  + ]:      62028 :                         if (!populated_zone(zone))
    3095                 :          0 :                                 continue;
    3096                 :            : 
    3097   [ +  +  +  + ]:     116838 :                         if (sc.priority != DEF_PRIORITY &&
    3098                 :            :                             !zone_reclaimable(zone))
    3099                 :       9454 :                                 continue;
    3100                 :            : 
    3101                 :      52574 :                         sc.nr_scanned = 0;
    3102                 :            : 
    3103                 :            :                         nr_soft_scanned = 0;
    3104                 :            :                         /*
    3105                 :            :                          * Call soft limit reclaim before calling shrink_zone.
    3106                 :            :                          */
    3107                 :            :                         nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
    3108                 :            :                                                         order, sc.gfp_mask,
    3109                 :            :                                                         &nr_soft_scanned);
    3110                 :            :                         sc.nr_reclaimed += nr_soft_reclaimed;
    3111                 :            : 
    3112                 :            :                         /*
    3113                 :            :                          * There should be no need to raise the scanning
    3114                 :            :                          * priority if enough pages are already being scanned
    3115                 :            :                          * that the high watermark would be met at 100%
    3116                 :            :                          * efficiency.
    3117                 :            :                          */
    3118         [ +  + ]:      52574 :                         if (kswapd_shrink_zone(zone, end_zone, &sc,
    3119                 :            :                                         lru_pages, &nr_attempted))
    3120                 :            :                                 raise_priority = false;
    3121                 :            :                 }
    3122                 :            : 
    3123                 :            :                 /*
    3124                 :            :                  * If the low watermark is met there is no need for processes
    3125                 :            :                  * to be throttled on pfmemalloc_wait as they should now be
    3126                 :            :                  * able to safely make forward progress. Wake them
    3127                 :            :                  */
    3128   [ -  +  #  # ]:      31014 :                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
    3129                 :          0 :                                 pfmemalloc_watermark_ok(pgdat))
    3130                 :          0 :                         wake_up(&pgdat->pfmemalloc_wait);
    3131                 :            : 
    3132                 :            :                 /*
    3133                 :            :                  * Fragmentation may mean that the system cannot be rebalanced
    3134                 :            :                  * for high-order allocations in all zones. If twice the
    3135                 :            :                  * allocation size has been reclaimed and the zones are still
    3136                 :            :                  * not balanced then recheck the watermarks at order-0 to
    3137                 :            :                  * prevent kswapd reclaiming excessively. Assume that a
    3138                 :            :                  * process that requested a high-order allocation can direct reclaim/compact.
    3139                 :            :                  */
    3140 [ +  + ][ -  + ]:      31014 :                 if (order && sc.nr_reclaimed >= 2UL << order)
    3141                 :          0 :                         order = sc.order = 0;
    3142                 :            : 
    3143                 :            :                 /* Check if kswapd should be suspending */
    3144 [ +  - ][ +  - ]:      31014 :                 if (try_to_freeze() || kthread_should_stop())
    3145                 :            :                         break;
    3146                 :            : 
    3147                 :            :                 /*
    3148                 :            :                  * Compact if necessary and kswapd is reclaiming at least the
    3149                 :            :                  * high watermark number of pages as requested
    3150                 :            :                  */
    3151 [ -  + ][ #  # ]:      31014 :                 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
    3152                 :          0 :                         compact_pgdat(pgdat, order);
    3153                 :            : 
    3154                 :            :                 /*
    3155                 :            :                  * Raise priority if scanning rate is too low or there was no
    3156                 :            :                  * progress in reclaiming pages
    3157                 :            :                  */
    3158 [ +  + ][ +  + ]:      31014 :                 if (raise_priority || !sc.nr_reclaimed)
    3159                 :      30854 :                         sc.priority--;
    3160         [ +  + ]:      28553 :         } while (sc.priority >= 1 &&
    3161         [ +  + ]:      31014 :                  !pgdat_balanced(pgdat, order, *classzone_idx));
    3162                 :            : 
    3163                 :            : out:
    3164                 :            :         /*
    3165                 :            :          * Return the order we were reclaiming at so prepare_kswapd_sleep()
    3166                 :            :          * makes a decision on the order we were last reclaiming at. However,
    3167                 :            :          * if another caller entered the allocator slow path while kswapd
    3168                 :            :          * was awake, order will remain at the higher level
    3169                 :            :          */
    3170                 :       3606 :         *classzone_idx = end_zone;
    3171                 :       3606 :         return order;
    3172                 :            : }
    3173                 :            : 
    3174                 :          0 : static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
    3175                 :            : {
    3176                 :            :         long remaining = 0;
    3177                 :       7212 :         DEFINE_WAIT(wait);
    3178                 :            : 
    3179 [ +  - ][ +  - ]:       3606 :         if (freezing(current) || kthread_should_stop())
    3180                 :          0 :                 return;
    3181                 :            : 
    3182                 :       3606 :         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    3183                 :            : 
    3184                 :            :         /* Try to sleep for a short interval */
    3185         [ +  + ]:       3606 :         if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
    3186                 :       1149 :                 remaining = schedule_timeout(HZ/10);
    3187                 :       1149 :                 finish_wait(&pgdat->kswapd_wait, &wait);
    3188                 :       1149 :                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    3189                 :            :         }
    3190                 :            : 
    3191                 :            :         /*
    3192                 :            :          * After a short sleep, check if it was a premature sleep. If not, then
    3193                 :            :          * go fully to sleep until explicitly woken up.
    3194                 :            :          */
    3195         [ +  + ]:       3606 :         if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
    3196                 :         10 :                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
    3197                 :            : 
    3198                 :            :                 /*
    3199                 :            :                  * vmstat counters are not perfectly accurate and the estimated
    3200                 :            :                  * value for counters such as NR_FREE_PAGES can deviate from the
    3201                 :            :                  * true value by nr_online_cpus * threshold. To avoid the zone
    3202                 :            :                  * watermarks being breached while under pressure, we reduce the
    3203                 :            :                  * per-cpu vmstat threshold while kswapd is awake and restore
    3204                 :            :                  * them before going back to sleep.
    3205                 :            :                  */
    3206                 :         10 :                 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
    3207                 :            : 
    3208                 :            :                 /*
    3209                 :            :                  * Compaction records what page blocks it recently failed to
    3210                 :            :                  * isolate pages from and skips them in the future scanning.
    3211                 :            :                  * When kswapd is going to sleep, it is reasonable to assume
    3212                 :            :                  * that pages and compaction may succeed so reset the cache.
    3213                 :            :                  */
    3214                 :         10 :                 reset_isolation_suitable(pgdat);
    3215                 :            : 
    3216         [ +  - ]:         10 :                 if (!kthread_should_stop())
    3217                 :         10 :                         schedule();
    3218                 :            : 
    3219                 :         10 :                 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
    3220                 :            :         } else {
    3221         [ +  + ]:       3596 :                 if (remaining)
    3222                 :            :                         count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
    3223                 :            :                 else
    3224                 :            :                         count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
    3225                 :            :         }
    3226                 :       3606 :         finish_wait(&pgdat->kswapd_wait, &wait);
    3227                 :            : }
    3228                 :            : 
    3229                 :            : /*
    3230                 :            :  * The background pageout daemon, started as a kernel thread
    3231                 :            :  * from the init process.
    3232                 :            :  *
    3233                 :            :  * This basically trickles out pages so that we have _some_
    3234                 :            :  * free memory available even if there is no other activity
    3235                 :            :  * that frees anything up. This is needed for things like routing
    3236                 :            :  * etc, where we otherwise might have all activity going on in
    3237                 :            :  * asynchronous contexts that cannot page things out.
    3238                 :            :  *
    3239                 :            :  * If there are applications that are active memory-allocators
    3240                 :            :  * (most normal use), this basically shouldn't matter.
    3241                 :            :  */
    3242                 :          0 : static int kswapd(void *p)
    3243                 :            : {
    3244                 :            :         unsigned long order, new_order;
    3245                 :            :         unsigned balanced_order;
    3246                 :            :         int classzone_idx, new_classzone_idx;
    3247                 :            :         int balanced_classzone_idx;
    3248                 :            :         pg_data_t *pgdat = (pg_data_t*)p;
    3249                 :          0 :         struct task_struct *tsk = current;
    3250                 :            : 
    3251                 :          0 :         struct reclaim_state reclaim_state = {
    3252                 :            :                 .reclaimed_slab = 0,
    3253                 :            :         };
    3254                 :          0 :         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
    3255                 :            : 
    3256                 :            :         lockdep_set_current_reclaim_state(GFP_KERNEL);
    3257                 :            : 
    3258         [ #  # ]:          0 :         if (!cpumask_empty(cpumask))
    3259                 :          0 :                 set_cpus_allowed_ptr(tsk, cpumask);
    3260                 :          0 :         current->reclaim_state = &reclaim_state;
    3261                 :            : 
    3262                 :            :         /*
    3263                 :            :          * Tell the memory management that we're a "memory allocator",
    3264                 :            :          * and that if we need more memory we should get access to it
    3265                 :            :          * regardless (see "__alloc_pages()"). "kswapd" should
    3266                 :            :          * never get caught in the normal page freeing logic.
    3267                 :            :          *
    3268                 :            :          * (Kswapd normally doesn't need memory anyway, but sometimes
    3269                 :            :          * you need a small amount of memory in order to be able to
    3270                 :            :          * page out something else, and this flag essentially protects
    3271                 :            :          * us from recursively trying to free more memory as we're
    3272                 :            :          * trying to free the first piece of memory in the first place).
    3273                 :            :          */
    3274                 :          0 :         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
    3275                 :          0 :         set_freezable();
    3276                 :            : 
    3277                 :            :         order = new_order = 0;
    3278                 :            :         balanced_order = 0;
    3279                 :          0 :         classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
    3280                 :       3606 :         balanced_classzone_idx = classzone_idx;
    3281                 :            :         for ( ; ; ) {
    3282                 :            :                 bool ret;
    3283                 :            : 
    3284                 :            :                 /*
    3285                 :            :                  * If the last balance_pgdat was unsuccessful it's unlikely a
    3286                 :            :                  * new request of a similar or harder type will succeed soon
    3287                 :            :                  * so consider going to sleep based on the order and classzone we reclaimed at
    3288                 :            :                  */
    3289         [ +  - ]:       3606 :                 if (balanced_classzone_idx >= new_classzone_idx &&
    3290                 :       3606 :                                         balanced_order == new_order) {
    3291                 :       3606 :                         new_order = pgdat->kswapd_max_order;
    3292                 :       3606 :                         new_classzone_idx = pgdat->classzone_idx;
    3293                 :       3606 :                         pgdat->kswapd_max_order =  0;
    3294                 :       3606 :                         pgdat->classzone_idx = pgdat->nr_zones - 1;
    3295                 :            :                 }
    3296                 :            : 
    3297         [ +  - ]:       3606 :                 if (order < new_order || classzone_idx > new_classzone_idx) {
    3298                 :            :                         /*
    3299                 :            :                          * Don't sleep if someone wants a larger 'order'
    3300                 :            :                          * allocation or has tighter zone constraints
    3301                 :            :                          */
    3302                 :            :                         order = new_order;
    3303                 :            :                         classzone_idx = new_classzone_idx;
    3304                 :            :                 } else {
    3305                 :       3606 :                         kswapd_try_to_sleep(pgdat, balanced_order,
    3306                 :            :                                                 balanced_classzone_idx);
    3307                 :       3606 :                         order = pgdat->kswapd_max_order;
    3308                 :       3606 :                         classzone_idx = pgdat->classzone_idx;
    3309                 :            :                         new_order = order;
    3310                 :            :                         new_classzone_idx = classzone_idx;
    3311                 :       3606 :                         pgdat->kswapd_max_order = 0;
    3312                 :       3606 :                         pgdat->classzone_idx = pgdat->nr_zones - 1;
    3313                 :            :                 }
    3314                 :            : 
    3315                 :            :                 ret = try_to_freeze();
    3316         [ +  - ]:       3606 :                 if (kthread_should_stop())
    3317                 :            :                         break;
    3318                 :            : 
    3319                 :            :                 /*
    3320                 :            :                  * We can speed up thawing tasks if we don't call balance_pgdat
    3321                 :            :                  * after returning from the refrigerator
    3322                 :            :                  */
    3323         [ -  + ]:       3606 :                 if (!ret) {
    3324                 :       3606 :                         trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
    3325                 :       3606 :                         balanced_classzone_idx = classzone_idx;
    3326                 :       3606 :                         balanced_order = balance_pgdat(pgdat, order,
    3327                 :            :                                                 &balanced_classzone_idx);
    3328                 :            :                 }
    3329                 :            :         }
    3330                 :            : 
    3331                 :          0 :         current->reclaim_state = NULL;
    3332                 :          0 :         return 0;
    3333                 :            : }
    3334                 :            : 
    3335                 :            : /*
    3336                 :            :  * A zone is low on free memory, so wake its kswapd task to service it.
    3337                 :            :  */
    3338                 :          0 : void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
    3339                 :            : {
    3340                 :            :         pg_data_t *pgdat;
    3341                 :            : 
    3342         [ +  + ]:      69302 :         if (!populated_zone(zone))
    3343                 :            :                 return;
    3344                 :            : 
    3345                 :            :         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    3346                 :            :                 return;
    3347                 :      69289 :         pgdat = zone->zone_pgdat;
    3348         [ +  + ]:      69289 :         if (pgdat->kswapd_max_order < order) {
    3349                 :          2 :                 pgdat->kswapd_max_order = order;
    3350                 :          2 :                 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
    3351                 :            :         }
    3352         [ +  + ]:      69289 :         if (!waitqueue_active(&pgdat->kswapd_wait))
    3353                 :            :                 return;
    3354         [ +  + ]:      26329 :         if (zone_balanced(zone, order, 0, 0))
    3355                 :            :                 return;
    3356                 :            : 
    3357                 :       1186 :         trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
    3358                 :       1186 :         wake_up_interruptible(&pgdat->kswapd_wait);
    3359                 :            : }
    3360                 :            : 
    3361                 :            : #ifdef CONFIG_HIBERNATION
    3362                 :            : /*
    3363                 :            :  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
    3364                 :            :  * freed pages.
    3365                 :            :  *
    3366                 :            :  * Rather than trying to age LRUs the aim is to preserve the overall
    3367                 :            :  * LRU order by reclaiming preferentially
    3368                 :            :  * inactive > active > active referenced > active mapped
    3369                 :            :  */
    3370                 :            : unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
    3371                 :            : {
    3372                 :            :         struct reclaim_state reclaim_state;
    3373                 :            :         struct scan_control sc = {
    3374                 :            :                 .gfp_mask = GFP_HIGHUSER_MOVABLE,
    3375                 :            :                 .may_swap = 1,
    3376                 :            :                 .may_unmap = 1,
    3377                 :            :                 .may_writepage = 1,
    3378                 :            :                 .nr_to_reclaim = nr_to_reclaim,
    3379                 :            :                 .hibernation_mode = 1,
    3380                 :            :                 .order = 0,
    3381                 :            :                 .priority = DEF_PRIORITY,
    3382                 :            :         };
    3383                 :            :         struct shrink_control shrink = {
    3384                 :            :                 .gfp_mask = sc.gfp_mask,
    3385                 :            :         };
    3386                 :            :         struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
    3387                 :            :         struct task_struct *p = current;
    3388                 :            :         unsigned long nr_reclaimed;
    3389                 :            : 
    3390                 :            :         p->flags |= PF_MEMALLOC;
    3391                 :            :         lockdep_set_current_reclaim_state(sc.gfp_mask);
    3392                 :            :         reclaim_state.reclaimed_slab = 0;
    3393                 :            :         p->reclaim_state = &reclaim_state;
    3394                 :            : 
    3395                 :            :         nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    3396                 :            : 
    3397                 :            :         p->reclaim_state = NULL;
    3398                 :            :         lockdep_clear_current_reclaim_state();
    3399                 :            :         p->flags &= ~PF_MEMALLOC;
    3400                 :            : 
    3401                 :            :         return nr_reclaimed;
    3402                 :            : }
    3403                 :            : #endif /* CONFIG_HIBERNATION */
    3404                 :            : 
    3405                 :            : /* It's optimal to keep kswapds on the same CPUs as their memory, but
    3406                 :            :    not required for correctness.  So if the last cpu in a node goes
    3407                 :            :    away, we get changed to run anywhere: as the first one comes back,
    3408                 :            :    restore their cpu bindings. */
    3409                 :          0 : static int cpu_callback(struct notifier_block *nfb, unsigned long action,
    3410                 :            :                         void *hcpu)
    3411                 :            : {
    3412                 :            :         int nid;
    3413                 :            : 
    3414         [ +  + ]:        555 :         if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
    3415         [ +  + ]:        162 :                 for_each_node_state(nid, N_MEMORY) {
    3416                 :            :                         pg_data_t *pgdat = NODE_DATA(nid);
    3417                 :            :                         const struct cpumask *mask;
    3418                 :            : 
    3419                 :         81 :                         mask = cpumask_of_node(pgdat->node_id);
    3420                 :            : 
    3421         [ -  + ]:         81 :                         if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
    3422                 :            :                                 /* One of our CPUs online: restore mask */
    3423                 :        162 :                                 set_cpus_allowed_ptr(pgdat->kswapd, mask);
    3424                 :            :                 }
    3425                 :            :         }
    3426                 :        555 :         return NOTIFY_OK;
    3427                 :            : }
    3428                 :            : 
    3429                 :            : /*
    3430                 :            :  * This kswapd start function will be called by init and node-hot-add.
    3431                 :            :  * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
    3432                 :            :  */
    3433                 :          0 : int kswapd_run(int nid)
    3434                 :            : {
    3435                 :            :         pg_data_t *pgdat = NODE_DATA(nid);
    3436                 :            :         int ret = 0;
    3437                 :            : 
    3438         [ #  # ]:          0 :         if (pgdat->kswapd)
    3439                 :            :                 return 0;
    3440                 :            : 
    3441         [ #  # ]:          0 :         pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
    3442         [ #  # ]:          0 :         if (IS_ERR(pgdat->kswapd)) {
    3443                 :            :                 /* failure at boot is fatal */
    3444         [ #  # ]:          0 :                 BUG_ON(system_state == SYSTEM_BOOTING);
    3445                 :          0 :                 pr_err("Failed to start kswapd on node %d\n", nid);
    3446                 :          0 :                 ret = PTR_ERR(pgdat->kswapd);
    3447                 :          0 :                 pgdat->kswapd = NULL;
    3448                 :            :         }
    3449                 :          0 :         return ret;
    3450                 :            : }
    3451                 :            : 
    3452                 :            : /*
    3453                 :            :  * Called by memory hotplug when all memory in a node is offlined.  Caller must
    3454                 :            :  * hold lock_memory_hotplug().
    3455                 :            :  */
    3456                 :          0 : void kswapd_stop(int nid)
    3457                 :            : {
    3458                 :          0 :         struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
    3459                 :            : 
    3460         [ #  # ]:          0 :         if (kswapd) {
    3461                 :          0 :                 kthread_stop(kswapd);
    3462                 :          0 :                 NODE_DATA(nid)->kswapd = NULL;
    3463                 :            :         }
    3464                 :          0 : }
    3465                 :            : 
    3466                 :          0 : static int __init kswapd_init(void)
    3467                 :            : {
    3468                 :            :         int nid;
    3469                 :            : 
    3470                 :          0 :         swap_setup();
    3471         [ #  # ]:          0 :         for_each_node_state(nid, N_MEMORY)
    3472                 :          0 :                 kswapd_run(nid);
    3473                 :          0 :         hotcpu_notifier(cpu_callback, 0);
    3474                 :          0 :         return 0;
    3475                 :            : }
    3476                 :            : 
    3477                 :            : module_init(kswapd_init)
    3478                 :            : 
    3479                 :            : #ifdef CONFIG_NUMA
    3480                 :            : /*
    3481                 :            :  * Zone reclaim mode
    3482                 :            :  *
    3483                 :            :  * If non-zero call zone_reclaim when the number of free pages falls below
    3484                 :            :  * the watermarks.
    3485                 :            :  */
    3486                 :            : int zone_reclaim_mode __read_mostly;
    3487                 :            : 
    3488                 :            : #define RECLAIM_OFF 0
    3489                 :            : #define RECLAIM_ZONE (1<<0)       /* Run shrink_inactive_list on the zone */
    3490                 :            : #define RECLAIM_WRITE (1<<1)      /* Writeout pages during reclaim */
    3491                 :            : #define RECLAIM_SWAP (1<<2)       /* Swap pages out during reclaim */
    3492                 :            : 
    3493                 :            : /*
    3494                 :            :  * Priority for ZONE_RECLAIM. This determines the fraction of pages
    3495                 :            :  * of a node considered for each zone_reclaim. 4 scans 1/16th of
    3496                 :            :  * a zone.
    3497                 :            :  */
    3498                 :            : #define ZONE_RECLAIM_PRIORITY 4
    3499                 :            : 
    3500                 :            : /*
    3501                 :            :  * Percentage of pages in a zone that must be unmapped for zone_reclaim to
    3502                 :            :  * occur.
    3503                 :            :  */
    3504                 :            : int sysctl_min_unmapped_ratio = 1;
    3505                 :            : 
    3506                 :            : /*
    3507                 :            :  * If the number of slab pages in a zone grows beyond this percentage then
    3508                 :            :  * slab reclaim needs to occur.
    3509                 :            :  */
    3510                 :            : int sysctl_min_slab_ratio = 5;
    3511                 :            : 
    3512                 :            : static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
    3513                 :            : {
    3514                 :            :         unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
    3515                 :            :         unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
    3516                 :            :                 zone_page_state(zone, NR_ACTIVE_FILE);
    3517                 :            : 
    3518                 :            :         /*
    3519                 :            :          * It's possible for there to be more file mapped pages than
    3520                 :            :          * accounted for by the pages on the file LRU lists because
    3521                 :            :          * tmpfs pages accounted for as ANON can also be FILE_MAPPED
    3522                 :            :          */
    3523                 :            :         return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
    3524                 :            : }
    3525                 :            : 
    3526                 :            : /* Work out how many page cache pages we can reclaim in this reclaim_mode */
    3527                 :            : static long zone_pagecache_reclaimable(struct zone *zone)
    3528                 :            : {
    3529                 :            :         long nr_pagecache_reclaimable;
    3530                 :            :         long delta = 0;
    3531                 :            : 
    3532                 :            :         /*
    3533                 :            :          * If RECLAIM_SWAP is set, then all file pages are considered
    3534                 :            :          * potentially reclaimable. Otherwise, we have to worry about
    3535                 :            :          * pages like swapcache and zone_unmapped_file_pages() provides
    3536                 :            :          * a better estimate
    3537                 :            :          */
    3538                 :            :         if (zone_reclaim_mode & RECLAIM_SWAP)
    3539                 :            :                 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
    3540                 :            :         else
    3541                 :            :                 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
    3542                 :            : 
    3543                 :            :         /* If we can't clean pages, remove dirty pages from consideration */
    3544                 :            :         if (!(zone_reclaim_mode & RECLAIM_WRITE))
    3545                 :            :                 delta += zone_page_state(zone, NR_FILE_DIRTY);
    3546                 :            : 
    3547                 :            :         /* Watch for any possible underflows due to delta */
    3548                 :            :         if (unlikely(delta > nr_pagecache_reclaimable))
    3549                 :            :                 delta = nr_pagecache_reclaimable;
    3550                 :            : 
    3551                 :            :         return nr_pagecache_reclaimable - delta;
    3552                 :            : }
    3553                 :            : 
    3554                 :            : /*
    3555                 :            :  * Try to free up some pages from this zone through reclaim.
    3556                 :            :  */
    3557                 :            : static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
    3558                 :            : {
    3559                 :            :         /* Minimum pages needed in order to stay on node */
    3560                 :            :         const unsigned long nr_pages = 1 << order;
    3561                 :            :         struct task_struct *p = current;
    3562                 :            :         struct reclaim_state reclaim_state;
    3563                 :            :         struct scan_control sc = {
    3564                 :            :                 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
    3565                 :            :                 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
    3566                 :            :                 .may_swap = 1,
    3567                 :            :                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
    3568                 :            :                 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
    3569                 :            :                 .order = order,
    3570                 :            :                 .priority = ZONE_RECLAIM_PRIORITY,
    3571                 :            :         };
    3572                 :            :         struct shrink_control shrink = {
    3573                 :            :                 .gfp_mask = sc.gfp_mask,
    3574                 :            :         };
    3575                 :            :         unsigned long nr_slab_pages0, nr_slab_pages1;
    3576                 :            : 
    3577                 :            :         cond_resched();
    3578                 :            :         /*
    3579                 :            :          * We need to be able to allocate from the reserves for RECLAIM_SWAP
    3580                 :            :          * and we also need to be able to write out pages for RECLAIM_WRITE
    3581                 :            :          * and RECLAIM_SWAP.
    3582                 :            :          */
    3583                 :            :         p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
    3584                 :            :         lockdep_set_current_reclaim_state(gfp_mask);
    3585                 :            :         reclaim_state.reclaimed_slab = 0;
    3586                 :            :         p->reclaim_state = &reclaim_state;
    3587                 :            : 
    3588                 :            :         if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
    3589                 :            :                 /*
    3590                 :            :                  * Free memory by calling shrink zone with increasing
    3591                 :            :                  * priorities until we have enough memory freed.
    3592                 :            :                  */
    3593                 :            :                 do {
    3594                 :            :                         shrink_zone(zone, &sc);
    3595                 :            :                 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
    3596                 :            :         }
    3597                 :            : 
    3598                 :            :         nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    3599                 :            :         if (nr_slab_pages0 > zone->min_slab_pages) {
    3600                 :            :                 /*
    3601                 :            :                  * shrink_slab() does not currently allow us to determine how
    3602                 :            :                  * many pages were freed in this zone. So we take the current
    3603                 :            :                  * number of slab pages and shake the slab until it is reduced
    3604                 :            :                  * by the same nr_pages that we used for reclaiming unmapped
    3605                 :            :                  * pages.
    3606                 :            :                  */
    3607                 :            :                 nodes_clear(shrink.nodes_to_scan);
    3608                 :            :                 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
    3609                 :            :                 for (;;) {
    3610                 :            :                         unsigned long lru_pages = zone_reclaimable_pages(zone);
    3611                 :            : 
    3612                 :            :                         /* No reclaimable slab or very low memory pressure */
    3613                 :            :                         if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
    3614                 :            :                                 break;
    3615                 :            : 
    3616                 :            :                         /* Freed enough memory */
    3617                 :            :                         nr_slab_pages1 = zone_page_state(zone,
    3618                 :            :                                                         NR_SLAB_RECLAIMABLE);
    3619                 :            :                         if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
    3620                 :            :                                 break;
    3621                 :            :                 }
    3622                 :            : 
    3623                 :            :                 /*
    3624                 :            :                  * Update nr_reclaimed by the number of slab pages we
    3625                 :            :                  * reclaimed from this zone.
    3626                 :            :                  */
    3627                 :            :                 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    3628                 :            :                 if (nr_slab_pages1 < nr_slab_pages0)
    3629                 :            :                         sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
    3630                 :            :         }
    3631                 :            : 
    3632                 :            :         p->reclaim_state = NULL;
    3633                 :            :         current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
    3634                 :            :         lockdep_clear_current_reclaim_state();
    3635                 :            :         return sc.nr_reclaimed >= nr_pages;
    3636                 :            : }
    3637                 :            : 
    3638                 :            : int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
    3639                 :            : {
    3640                 :            :         int node_id;
    3641                 :            :         int ret;
    3642                 :            : 
    3643                 :            :         /*
    3644                 :            :          * Zone reclaim reclaims unmapped file backed pages and
    3645                 :            :          * slab pages if we are over the defined limits.
    3646                 :            :          *
    3647                 :            :          * A small portion of unmapped file backed pages is needed for
    3648                 :            :          * file I/O otherwise pages read by file I/O will be immediately
    3649                 :            :          * thrown out if the zone is overallocated. So we do not reclaim
    3650                 :            :          * if less than a specified percentage of the zone is used by
    3651                 :            :          * unmapped file backed pages.
    3652                 :            :          */
    3653                 :            :         if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
    3654                 :            :             zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
    3655                 :            :                 return ZONE_RECLAIM_FULL;
    3656                 :            : 
    3657                 :            :         if (!zone_reclaimable(zone))
    3658                 :            :                 return ZONE_RECLAIM_FULL;
    3659                 :            : 
    3660                 :            :         /*
    3661                 :            :          * Do not scan if the allocation should not be delayed.
    3662                 :            :          */
    3663                 :            :         if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
    3664                 :            :                 return ZONE_RECLAIM_NOSCAN;
    3665                 :            : 
    3666                 :            :         /*
    3667                 :            :          * Only run zone reclaim on the local zone or on zones that do not
    3668                 :            :          * have associated processors. This will favor the local processor
    3669                 :            :          * over remote processors and spread off node memory allocations
    3670                 :            :          * as wide as possible.
    3671                 :            :          */
    3672                 :            :         node_id = zone_to_nid(zone);
    3673                 :            :         if (node_state(node_id, N_CPU) && node_id != numa_node_id())
    3674                 :            :                 return ZONE_RECLAIM_NOSCAN;
    3675                 :            : 
    3676                 :            :         if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
    3677                 :            :                 return ZONE_RECLAIM_NOSCAN;
    3678                 :            : 
    3679                 :            :         ret = __zone_reclaim(zone, gfp_mask, order);
    3680                 :            :         zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
    3681                 :            : 
    3682                 :            :         if (!ret)
    3683                 :            :                 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
    3684                 :            : 
    3685                 :            :         return ret;
    3686                 :            : }
    3687                 :            : #endif
    3688                 :            : 
    3689                 :            : /*
    3690                 :            :  * page_evictable - test whether a page is evictable
    3691                 :            :  * @page: the page to test
    3692                 :            :  *
    3693                 :            :  * Test whether page is evictable--i.e., should be placed on active/inactive
    3694                 :            :  * lists vs unevictable list.
    3695                 :            :  *
    3696                 :            :  * Reasons page might not be evictable:
    3697                 :            :  * (1) page's mapping marked unevictable
    3698                 :            :  * (2) page is part of an mlocked VMA
    3699                 :            :  *
    3700                 :            :  */
    3701                 :          0 : int page_evictable(struct page *page)
    3702                 :            : {
    3703    [ + ][ +  + ]:     377696 :         return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
    3704                 :            : }
    3705                 :            : 
    3706                 :            : #ifdef CONFIG_SHMEM
    3707                 :            : /**
    3708                 :            :  * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
    3709                 :            :  * @pages:      array of pages to check
    3710                 :            :  * @nr_pages:   number of pages to check
    3711                 :            :  *
    3712                 :            :  * Checks pages for evictability and moves them to the appropriate lru list.
    3713                 :            :  *
    3714                 :            :  * This function is only used for SysV IPC SHM_UNLOCK.
    3715                 :            :  */
    3716                 :          0 : void check_move_unevictable_pages(struct page **pages, int nr_pages)
    3717                 :            : {
    3718                 :            :         struct lruvec *lruvec;
    3719                 :            :         struct zone *zone = NULL;
    3720                 :            :         int pgscanned = 0;
    3721                 :            :         int pgrescued = 0;
    3722                 :            :         int i;
    3723                 :            : 
    3724         [ +  + ]:          2 :         for (i = 0; i < nr_pages; i++) {
    3725                 :          1 :                 struct page *page = pages[i];
    3726                 :            :                 struct zone *pagezone;
    3727                 :            : 
    3728                 :          1 :                 pgscanned++;
    3729                 :          1 :                 pagezone = page_zone(page);
    3730         [ +  - ]:          1 :                 if (pagezone != zone) {
    3731         [ -  + ]:          1 :                         if (zone)
    3732                 :            :                                 spin_unlock_irq(&zone->lru_lock);
    3733                 :            :                         zone = pagezone;
    3734                 :            :                         spin_lock_irq(&zone->lru_lock);
    3735                 :            :                 }
    3736                 :            :                 lruvec = mem_cgroup_page_lruvec(page, zone);
    3737                 :            : 
    3738 [ +  - ][ +  - ]:          2 :                 if (!PageLRU(page) || !PageUnevictable(page))
    3739                 :          1 :                         continue;
    3740                 :            : 
    3741         [ #  # ]:          0 :                 if (page_evictable(page)) {
    3742                 :            :                         enum lru_list lru = page_lru_base_type(page);
    3743                 :            : 
    3744                 :            :                         VM_BUG_ON_PAGE(PageActive(page), page);
    3745                 :            :                         ClearPageUnevictable(page);
    3746                 :            :                         del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
    3747                 :            :                         add_page_to_lru_list(page, lruvec, lru);
    3748                 :          0 :                         pgrescued++;
    3749                 :            :                 }
    3750                 :            :         }
    3751                 :            : 
    3752         [ +  - ]:          1 :         if (zone) {
    3753                 :            :                 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
    3754                 :            :                 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
    3755                 :            :                 spin_unlock_irq(&zone->lru_lock);
    3756                 :            :         }
    3757                 :          1 : }
    3758                 :            : #endif /* CONFIG_SHMEM */
    3759                 :            : 
    3760                 :          0 : static void warn_scan_unevictable_pages(void)
    3761                 :            : {
    3762         [ +  + ]:          2 :         printk_once(KERN_WARNING
    3763                 :            :                     "%s: The scan_unevictable_pages sysctl/node-interface has been "
    3764                 :            :                     "disabled for lack of a legitimate use case.  If you have "
    3765                 :            :                     "one, please send an email to linux-mm@kvack.org.\n",
    3766                 :            :                     current->comm);
    3767                 :          0 : }
    3768                 :            : 
    3769                 :            : /*
    3770                 :            :  * scan_unevictable_pages [vm] sysctl handler.  On demand re-scan of
    3771                 :            :  * all nodes' unevictable lists for evictable pages
    3772                 :            :  */
    3773                 :            : unsigned long scan_unevictable_pages;
    3774                 :            : 
    3775                 :          0 : int scan_unevictable_handler(struct ctl_table *table, int write,
    3776                 :            :                            void __user *buffer,
    3777                 :            :                            size_t *length, loff_t *ppos)
    3778                 :            : {
    3779                 :          2 :         warn_scan_unevictable_pages();
    3780                 :          2 :         proc_doulongvec_minmax(table, write, buffer, length, ppos);
    3781                 :          2 :         scan_unevictable_pages = 0;
    3782                 :          2 :         return 0;
    3783                 :            : }
    3784                 :            : 
    3785                 :            : #ifdef CONFIG_NUMA
    3786                 :            : /*
    3787                 :            :  * per node 'scan_unevictable_pages' attribute.  On demand re-scan of
    3788                 :            :  * a specified node's per zone unevictable lists for evictable pages.
    3789                 :            :  */
    3790                 :            : 
    3791                 :            : static ssize_t read_scan_unevictable_node(struct device *dev,
    3792                 :            :                                           struct device_attribute *attr,
    3793                 :            :                                           char *buf)
    3794                 :            : {
    3795                 :            :         warn_scan_unevictable_pages();
    3796                 :            :         return sprintf(buf, "0\n");   /* always zero; should fit... */
    3797                 :            : }
    3798                 :            : 
    3799                 :            : static ssize_t write_scan_unevictable_node(struct device *dev,
    3800                 :            :                                            struct device_attribute *attr,
    3801                 :            :                                         const char *buf, size_t count)
    3802                 :            : {
    3803                 :            :         warn_scan_unevictable_pages();
    3804                 :            :         return 1;
    3805                 :            : }
    3806                 :            : 
    3807                 :            : 
    3808                 :            : static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
    3809                 :            :                         read_scan_unevictable_node,
    3810                 :            :                         write_scan_unevictable_node);
    3811                 :            : 
    3812                 :            : int scan_unevictable_register_node(struct node *node)
    3813                 :            : {
    3814                 :            :         return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
    3815                 :            : }
    3816                 :            : 
    3817                 :            : void scan_unevictable_unregister_node(struct node *node)
    3818                 :            : {
    3819                 :            :         device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
    3820                 :            : }
    3821                 :            : #endif

Generated by: LCOV version 1.9