Branch data Line data Source code
1 : : /*
2 : : * linux/mm/vmscan.c
3 : : *
4 : : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 : : *
6 : : * Swap reorganised 29.12.95, Stephen Tweedie.
7 : : * kswapd added: 7.1.96 sct
8 : : * Removed kswapd_ctl limits, and swap out as many pages as needed
9 : : * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 : : * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 : : * Multiqueue VM started 5.8.00, Rik van Riel.
12 : : */
13 : :
14 : : #include <linux/mm.h>
15 : : #include <linux/module.h>
16 : : #include <linux/gfp.h>
17 : : #include <linux/kernel_stat.h>
18 : : #include <linux/swap.h>
19 : : #include <linux/pagemap.h>
20 : : #include <linux/init.h>
21 : : #include <linux/highmem.h>
22 : : #include <linux/vmpressure.h>
23 : : #include <linux/vmstat.h>
24 : : #include <linux/file.h>
25 : : #include <linux/writeback.h>
26 : : #include <linux/blkdev.h>
27 : : #include <linux/buffer_head.h> /* for try_to_release_page(),
28 : : buffer_heads_over_limit */
29 : : #include <linux/mm_inline.h>
30 : : #include <linux/backing-dev.h>
31 : : #include <linux/rmap.h>
32 : : #include <linux/topology.h>
33 : : #include <linux/cpu.h>
34 : : #include <linux/cpuset.h>
35 : : #include <linux/compaction.h>
36 : : #include <linux/notifier.h>
37 : : #include <linux/rwsem.h>
38 : : #include <linux/delay.h>
39 : : #include <linux/kthread.h>
40 : : #include <linux/freezer.h>
41 : : #include <linux/memcontrol.h>
42 : : #include <linux/delayacct.h>
43 : : #include <linux/sysctl.h>
44 : : #include <linux/oom.h>
45 : : #include <linux/prefetch.h>
46 : : #include <linux/debugfs.h>
47 : :
48 : : #include <asm/tlbflush.h>
49 : : #include <asm/div64.h>
50 : :
51 : : #include <linux/swapops.h>
52 : : #include <linux/balloon_compaction.h>
53 : :
54 : : #include "internal.h"
55 : :
56 : : #define CREATE_TRACE_POINTS
57 : : #include <trace/events/vmscan.h>
58 : :
59 : : struct scan_control {
60 : : /* Incremented by the number of inactive pages that were scanned */
61 : : unsigned long nr_scanned;
62 : :
63 : : /* Number of pages freed so far during a call to shrink_zones() */
64 : : unsigned long nr_reclaimed;
65 : :
66 : : /* How many pages shrink_list() should reclaim */
67 : : unsigned long nr_to_reclaim;
68 : :
69 : : unsigned long hibernation_mode;
70 : :
71 : : /* This context's GFP mask */
72 : : gfp_t gfp_mask;
73 : :
74 : : int may_writepage;
75 : :
76 : : /* Can mapped pages be reclaimed? */
77 : : int may_unmap;
78 : :
79 : : /* Can pages be swapped as part of reclaim? */
80 : : int may_swap;
81 : :
82 : : int order;
83 : :
84 : : /* Scan (total_size >> priority) pages at once */
85 : : int priority;
86 : :
87 : : /*
88 : : * The memory cgroup that hit its limit and as a result is the
89 : : * primary target of this reclaim invocation.
90 : : */
91 : : struct mem_cgroup *target_mem_cgroup;
92 : :
93 : : /*
94 : : * Nodemask of nodes allowed by the caller. If NULL, all nodes
95 : : * are scanned.
96 : : */
97 : : nodemask_t *nodemask;
98 : : };
99 : :
100 : : #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
101 : :
102 : : #ifdef ARCH_HAS_PREFETCH
103 : : #define prefetch_prev_lru_page(_page, _base, _field) \
104 : : do { \
105 : : if ((_page)->lru.prev != _base) { \
106 : : struct page *prev; \
107 : : \
108 : : prev = lru_to_page(&(_page->lru)); \
109 : : prefetch(&prev->_field); \
110 : : } \
111 : : } while (0)
112 : : #else
113 : : #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
114 : : #endif
115 : :
116 : : #ifdef ARCH_HAS_PREFETCHW
117 : : #define prefetchw_prev_lru_page(_page, _base, _field) \
118 : : do { \
119 : : if ((_page)->lru.prev != _base) { \
120 : : struct page *prev; \
121 : : \
122 : : prev = lru_to_page(&(_page->lru)); \
123 : : prefetchw(&prev->_field); \
124 : : } \
125 : : } while (0)
126 : : #else
127 : : #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
128 : : #endif
129 : :
130 : : /*
131 : : * From 0 .. 100. Higher means more swappy.
132 : : */
133 : : int vm_swappiness = 60;
134 : : unsigned long vm_total_pages; /* The total number of pages which the VM controls */
135 : :
136 : : static LIST_HEAD(shrinker_list);
137 : : static DECLARE_RWSEM(shrinker_rwsem);
138 : :
139 : : #ifdef CONFIG_MEMCG
140 : : static bool global_reclaim(struct scan_control *sc)
141 : : {
142 : : return !sc->target_mem_cgroup;
143 : : }
144 : : #else
145 : : static bool global_reclaim(struct scan_control *sc)
146 : : {
147 : : return true;
148 : : }
149 : : #endif
150 : :
151 : 0 : unsigned long zone_reclaimable_pages(struct zone *zone)
152 : : {
153 : : int nr;
154 : :
155 : 2673939 : nr = zone_page_state(zone, NR_ACTIVE_FILE) +
156 : : zone_page_state(zone, NR_INACTIVE_FILE);
157 : :
158 [ + ]: 2673939 : if (get_nr_swap_pages() > 0)
159 : 1278 : nr += zone_page_state(zone, NR_ACTIVE_ANON) +
160 : : zone_page_state(zone, NR_INACTIVE_ANON);
161 : :
162 : 0 : return nr;
163 : : }
164 : :
165 : 0 : bool zone_reclaimable(struct zone *zone)
166 : : {
167 : 464618 : return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
168 : : }
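/*
 * Editor's note: a worked example of the heuristic above, with assumed
 * numbers (not taken from this coverage run). A zone counts as
 * reclaimable while pages_scanned stays below six times its
 * reclaimable pages:
 *
 *	zone_reclaimable_pages(zone) == 1000	(assumed)
 *	pages_scanned == 5999  -> still reclaimable (5999 < 6000)
 *	pages_scanned == 6000  -> give up on this zone
 */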
169 : :
170 : : static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
171 : : {
172 : : if (!mem_cgroup_disabled())
173 : : return mem_cgroup_get_lru_size(lruvec, lru);
174 : :
175 : 811779 : return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
176 : : }
177 : :
178 : : struct dentry *debug_file;
179 : :
180 : 0 : static int debug_shrinker_show(struct seq_file *s, void *unused)
181 : : {
182 : : struct shrinker *shrinker;
183 : : struct shrink_control sc;
184 : :
185 : 0 : sc.gfp_mask = -1;
186 : 0 : sc.nr_to_scan = 0;
187 : :
188 : 0 : down_read(&shrinker_rwsem);
189 [ # # ]: 0 : list_for_each_entry(shrinker, &shrinker_list, list) {
190 : : int num_objs;
191 : :
192 : 0 : num_objs = shrinker->count_objects(shrinker, &sc);
193 : 0 : seq_printf(s, "%pf %d\n", shrinker->count_objects, num_objs);
194 : : }
195 : 0 : up_read(&shrinker_rwsem);
196 : 0 : return 0;
197 : : }
198 : :
199 : 0 : static int debug_shrinker_open(struct inode *inode, struct file *file)
200 : : {
201 : 0 : return single_open(file, debug_shrinker_show, inode->i_private);
202 : : }
203 : :
204 : : static const struct file_operations debug_shrinker_fops = {
205 : : .open = debug_shrinker_open,
206 : : .read = seq_read,
207 : : .llseek = seq_lseek,
208 : : .release = single_release,
209 : : };
210 : :
211 : : /*
212 : : * Add a shrinker callback to be called from the vm.
213 : : */
214 : 0 : int register_shrinker(struct shrinker *shrinker)
215 : : {
216 : : size_t size = sizeof(*shrinker->nr_deferred);
217 : :
218 : : /*
219 : : * If we only have one possible node in the system anyway, save
220 : : * ourselves the trouble and disable NUMA aware behavior. This way we
221 : : * will save memory and some small loop time later.
222 : : */
223 : : if (nr_node_ids == 1)
224 : 3 : shrinker->flags &= ~SHRINKER_NUMA_AWARE;
225 : :
226 : : if (shrinker->flags & SHRINKER_NUMA_AWARE)
227 : : size *= nr_node_ids;
228 : :
229 : 3 : shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
230 [ + - ]: 3 : if (!shrinker->nr_deferred)
231 : : return -ENOMEM;
232 : :
233 : 3 : down_write(&shrinker_rwsem);
234 : 3 : list_add_tail(&shrinker->list, &shrinker_list);
235 : 3 : up_write(&shrinker_rwsem);
236 : 3 : return 0;
237 : : }
238 : : EXPORT_SYMBOL(register_shrinker);
239 : :
240 : 0 : static int __init add_shrinker_debug(void)
241 : : {
242 : 0 : debugfs_create_file("shrinker", 0644, NULL, NULL,
243 : : &debug_shrinker_fops);
244 : 0 : return 0;
245 : : }
246 : :
247 : : late_initcall(add_shrinker_debug);
248 : :
249 : : /*
250 : : * Remove one
251 : : */
252 : 0 : void unregister_shrinker(struct shrinker *shrinker)
253 : : {
254 : 3 : down_write(&shrinker_rwsem);
255 : : list_del(&shrinker->list);
256 : 3 : up_write(&shrinker_rwsem);
257 : 3 : kfree(shrinker->nr_deferred);
258 : 3 : }
259 : : EXPORT_SYMBOL(unregister_shrinker);
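/*
 * Editor's note: a hedged sketch of a minimal client of the
 * register_shrinker()/unregister_shrinker() API above. The cache and
 * its two helpers (demo_cache_count, demo_cache_evict) are
 * hypothetical; only the callback shapes, SHRINK_STOP and
 * DEFAULT_SEEKS come from the kernel API used in this file.
 */
static unsigned long demo_count_objects(struct shrinker *shrink,
					struct shrink_control *sc)
{
	return demo_cache_count();		/* hypothetical helper */
}

static unsigned long demo_scan_objects(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (!(sc->gfp_mask & __GFP_FS))
		return SHRINK_STOP;		/* cannot recurse into the FS */
	/* evict up to sc->nr_to_scan objects, return how many were freed */
	return demo_cache_evict(sc->nr_to_scan);	/* hypothetical helper */
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count_objects,
	.scan_objects	= demo_scan_objects,
	.seeks		= DEFAULT_SEEKS,
};

/* pair register_shrinker(&demo_shrinker) at init with
 * unregister_shrinker(&demo_shrinker) at teardown. */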
260 : :
261 : : #define SHRINK_BATCH 128
262 : :
263 : : static unsigned long
264 : 0 : shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
265 : : unsigned long nr_pages_scanned, unsigned long lru_pages)
266 : : {
267 : : unsigned long freed = 0;
268 : : unsigned long long delta;
269 : : long total_scan;
270 : : long max_pass;
271 : : long nr;
272 : : long new_nr;
273 : 4604685 : int nid = shrinkctl->nid;
274 : 4604685 : long batch_size = shrinker->batch ? shrinker->batch
275 [ + + ]: 4604685 : : SHRINK_BATCH;
276 : :
277 : 4604685 : max_pass = shrinker->count_objects(shrinker, shrinkctl);
278 [ + + ]: 4604843 : if (max_pass == 0)
279 : : return 0;
280 : :
281 : : /*
282 : : * copy the current shrinker scan count into a local variable
283 : : * and zero it so that other concurrent shrinker invocations
284 : : * don't also do this scanning work.
285 : : */
286 : 5965396 : nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
287 : :
288 : : total_scan = nr;
289 : 5284764 : delta = (4 * nr_pages_scanned) / shrinker->seeks;
290 : 5284764 : delta *= max_pass;
291 [ - + ][ # # ]: 5284764 : do_div(delta, lru_pages + 1);
[ - + ][ - + ]
292 : 680079 : total_scan += delta;
293 [ - + ]: 680079 : if (total_scan < 0) {
294 : 0 : printk(KERN_ERR
295 : : "shrink_slab: %pF negative objects to delete nr=%ld\n",
296 : : shrinker->scan_objects, total_scan);
297 : : total_scan = max_pass;
298 : : }
299 : :
300 : : /*
301 : : * We need to avoid excessive windup on filesystem shrinkers
302 : : * due to large numbers of GFP_NOFS allocations causing the
303 : : * shrinkers to return -1 all the time. This results in a large
304 : : * nr being built up so when a shrink that can do some work
305 : : * comes along it empties the entire cache due to nr >>>
306 : : * max_pass. This is bad for sustaining a working set in
307 : : * memory.
308 : : *
309 : : * Hence only allow the shrinker to scan the entire cache when
310 : : * a large delta change is calculated directly.
311 : : */
312 [ + + ]: 679779 : if (delta < max_pass / 4)
313 : 12157 : total_scan = min(total_scan, max_pass / 2);
314 : :
315 : : /*
316 : : * Avoid risking looping forever due to too large nr value:
317 : : * never try to free more than twice the estimated number of
318 : : * freeable entries.
319 : : */
320 [ + + ]: 679779 : if (total_scan > max_pass * 2)
321 : : total_scan = max_pass * 2;
322 : :
323 : 679779 : trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
324 : : nr_pages_scanned, lru_pages,
325 : : max_pass, delta, total_scan);
326 : :
327 [ + + ]: 679845 : while (total_scan >= batch_size) {
328 : : unsigned long ret;
329 : :
330 : 217 : shrinkctl->nr_to_scan = batch_size;
331 : 217 : ret = shrinker->scan_objects(shrinker, shrinkctl);
332 [ + - ]: 217 : if (ret == SHRINK_STOP)
333 : : break;
334 : 217 : freed += ret;
335 : :
336 : : count_vm_events(SLABS_SCANNED, batch_size);
337 : 217 : total_scan -= batch_size;
338 : :
339 : 217 : cond_resched();
340 : : }
341 : :
342 : : /*
343 : : * move the unused scan count back into the shrinker in a
344 : : * manner that handles concurrent updates. If we exhausted the
345 : : * scan, there is no need to do an update.
346 : : */
347 [ + + ]: 679628 : if (total_scan > 0)
348 : 679536 : new_nr = atomic_long_add_return(total_scan,
349 : 679536 : &shrinker->nr_deferred[nid]);
350 : : else
351 : 92 : new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
352 : :
353 : 679208 : trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
354 : 679208 : return freed;
355 : : }
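/*
 * Editor's note: a self-contained arithmetic sketch of the
 * delta/total_scan computation above, using assumed sample inputs so
 * the two clamps can be checked by hand. Plain C, not kernel code.
 */
static long sample_shrink_slab_math(void)
{
	unsigned long nr_pages_scanned = 1024;	/* assumed */
	unsigned long lru_pages = 16384;	/* assumed */
	long max_pass = 2048;			/* objects the cache reports */
	int seeks = 2;				/* DEFAULT_SEEKS */
	long nr = 100;				/* work deferred from last pass */
	unsigned long long delta;
	long total_scan;

	delta = (4ULL * nr_pages_scanned) / seeks;	/* 4096/2 = 2048 */
	delta = delta * max_pass / (lru_pages + 1);	/* 4194304/16385 = 255 */
	total_scan = nr + (long)delta;			/* 100 + 255 = 355 */

	/* windup guard: a small delta forbids emptying the whole cache */
	if (delta < max_pass / 4 && total_scan > max_pass / 2)
		total_scan = max_pass / 2;	/* no-op here: 355 <= 1024 */

	/* never scan more than twice the reported cache size */
	if (total_scan > max_pass * 2)
		total_scan = max_pass * 2;	/* no-op here */

	return total_scan;			/* 355 */
}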
356 : :
357 : : /*
358 : : * Call the shrink functions to age shrinkable caches
359 : : *
360 : : * Here we assume it costs one seek to replace a lru page and that it also
361 : : * takes a seek to recreate a cache object. With this in mind we age equal
362 : : * percentages of the lru and ageable caches. This should balance the seeks
363 : : * generated by these structures.
364 : : *
365 : : * If the vm encounters mapped pages on the LRU it increases the pressure on
366 : : * slab to avoid swapping.
367 : : *
368 : : * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
369 : : *
370 : : * `lru_pages' represents the number of on-LRU pages in all the zones which
371 : : * are eligible for the caller's allocation attempt. It is used for balancing
372 : : * slab reclaim versus page reclaim.
373 : : *
374 : : * Returns the number of slab objects which we shrunk.
375 : : */
376 : 0 : unsigned long shrink_slab(struct shrink_control *shrinkctl,
377 : : unsigned long nr_pages_scanned,
378 : : unsigned long lru_pages)
379 : : {
380 : : struct shrinker *shrinker;
381 : : unsigned long freed = 0;
382 : :
383 [ + + ]: 170903 : if (nr_pages_scanned == 0)
384 : : nr_pages_scanned = SWAP_CLUSTER_MAX;
385 : :
386 [ + + ]: 170903 : if (!down_read_trylock(&shrinker_rwsem)) {
387 : : /*
388 : : * If we would return 0, our callers would understand that we
389 : : * have nothing else to shrink and give up trying. By returning
390 : : * 1 we keep it going and assume we'll be able to shrink next
391 : : * time.
392 : : */
393 : : freed = 1;
394 : : goto out;
395 : : }
396 : :
397 [ + + ]: 4772154 : list_for_each_entry(shrinker, &shrinker_list, list) {
398 [ + ][ + + ]: 9199179 : for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
399 [ + ]: 4601992 : if (!node_online(shrinkctl->nid))
400 : 0 : continue;
401 : :
402 [ + ][ + + ]: 4602097 : if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
403 : : (shrinkctl->nid != 0))
404 : : break;
405 : :
406 : 4597025 : freed += shrink_slab_node(shrinkctl, shrinker,
407 : : nr_pages_scanned, lru_pages);
408 : :
409 : : }
410 : : }
411 : 170962 : up_read(&shrinker_rwsem);
412 : : out:
413 : 172026 : cond_resched();
414 : 170933 : return freed;
415 : : }
416 : :
417 : 288 : static inline int is_page_cache_freeable(struct page *page)
418 : : {
419 : : /*
420 : : * A freeable page cache page is referenced only by the caller
421 : : * that isolated the page, the page cache radix tree and
422 : : * optional buffer heads at page->private.
423 : : */
424 : 288 : return page_count(page) - page_has_private(page) == 2;
425 : : }
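/*
 * Editor's note: worked counts for the check above (assumed scenario).
 * An isolated, otherwise-unused pagecache page holds two references:
 * the isolating caller and the page cache radix tree, so
 * page_count() - page_has_private() == 2 - 0 == 2. With buffer heads
 * attached the counts become 3 - 1 == 2, so it is still freeable; any
 * extra reference beyond these makes the page non-freeable.
 */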
426 : :
427 : 288 : static int may_write_to_queue(struct backing_dev_info *bdi,
428 : : struct scan_control *sc)
429 : : {
430 [ - + ]: 288 : if (current->flags & PF_SWAPWRITE)
431 : : return 1;
432 [ # # ]: 0 : if (!bdi_write_congested(bdi))
433 : : return 1;
434 [ # # ]: 0 : if (bdi == current->backing_dev_info)
435 : : return 1;
436 : : return 0;
437 : : }
438 : :
439 : : /*
440 : : * We detected a synchronous write error writing a page out. Probably
441 : : * -ENOSPC. We need to propagate that into the address_space for a subsequent
442 : : * fsync(), msync() or close().
443 : : *
444 : : * The tricky part is that after writepage we cannot touch the mapping: nothing
445 : : * prevents it from being freed up. But we have a ref on the page and once
446 : : * that page is locked, the mapping is pinned.
447 : : *
448 : : * We're allowed to run sleeping lock_page() here because we know the caller has
449 : : * __GFP_FS.
450 : : */
451 : 0 : static void handle_write_error(struct address_space *mapping,
452 : : struct page *page, int error)
453 : : {
454 : : lock_page(page);
455 [ # # ]: 0 : if (page_mapping(page) == mapping)
456 : : mapping_set_error(mapping, error);
457 : 0 : unlock_page(page);
458 : 0 : }
459 : :
460 : : /* possible outcome of pageout() */
461 : : typedef enum {
462 : : /* failed to write page out, page is locked */
463 : : PAGE_KEEP,
464 : : /* move page to the active list, page is locked */
465 : : PAGE_ACTIVATE,
466 : : /* page has been sent to the disk successfully, page is unlocked */
467 : : PAGE_SUCCESS,
468 : : /* page is clean and locked */
469 : : PAGE_CLEAN,
470 : : } pageout_t;
471 : :
472 : : /*
473 : : * pageout is called by shrink_page_list() for each dirty page.
474 : : * Calls ->writepage().
475 : : */
476 : 0 : static pageout_t pageout(struct page *page, struct address_space *mapping,
477 : : struct scan_control *sc)
478 : : {
479 : : /*
480 : : * If the page is dirty, only perform writeback if that write
481 : : * will be non-blocking, to prevent this allocation from being
482 : : * stalled by pagecache activity. But note that there may be
483 : : * stalls if we need to run get_block(). We could test
484 : : * PagePrivate for that.
485 : : *
486 : : * If this process is currently in __generic_file_aio_write() against
487 : : * this page's queue, we can perform writeback even if that
488 : : * will block.
489 : : *
490 : : * If the page is swapcache, write it back even if that would
491 : : * block, for some throttling. This happens by accident, because
492 : : * swap_backing_dev_info is bust: it doesn't reflect the
493 : : * congestion state of the swapdevs. Easy to fix, if needed.
494 : : */
495 [ + - ]: 288 : if (!is_page_cache_freeable(page))
496 : : return PAGE_KEEP;
497 [ - + ]: 288 : if (!mapping) {
498 : : /*
499 : : * Some data journaling orphaned pages can have
500 : : * page->mapping == NULL while being dirty with clean buffers.
501 : : */
502 [ # # ]: 0 : if (page_has_private(page)) {
503 [ # # ]: 0 : if (try_to_free_buffers(page)) {
504 : : ClearPageDirty(page);
505 : 0 : printk("%s: orphaned page\n", __func__);
506 : : return PAGE_CLEAN;
507 : : }
508 : : }
509 : : return PAGE_KEEP;
510 : : }
511 [ + - ]: 288 : if (mapping->a_ops->writepage == NULL)
512 : : return PAGE_ACTIVATE;
513 [ + - ]: 288 : if (!may_write_to_queue(mapping->backing_dev_info, sc))
514 : : return PAGE_KEEP;
515 : :
516 [ + - ]: 288 : if (clear_page_dirty_for_io(page)) {
517 : : int res;
518 : 288 : struct writeback_control wbc = {
519 : : .sync_mode = WB_SYNC_NONE,
520 : : .nr_to_write = SWAP_CLUSTER_MAX,
521 : : .range_start = 0,
522 : : .range_end = LLONG_MAX,
523 : : .for_reclaim = 1,
524 : : };
525 : :
526 : : SetPageReclaim(page);
527 : 288 : res = mapping->a_ops->writepage(page, &wbc);
528 [ - + ]: 288 : if (res < 0)
529 : 0 : handle_write_error(mapping, page, res);
530 [ - + ]: 288 : if (res == AOP_WRITEPAGE_ACTIVATE) {
531 : : ClearPageReclaim(page);
532 : : return PAGE_ACTIVATE;
533 : : }
534 : :
535 [ + + ]: 288 : if (!PageWriteback(page)) {
536 : : /* synchronous write or broken a_ops? */
537 : : ClearPageReclaim(page);
538 : : }
539 [ - + ]: 576 : trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
540 : 288 : inc_zone_page_state(page, NR_VMSCAN_WRITE);
541 : : return PAGE_SUCCESS;
542 : : }
543 : :
544 : : return PAGE_CLEAN;
545 : : }
546 : :
547 : : /*
548 : : * Same as remove_mapping, but if the page is removed from the mapping, it
549 : : * gets returned with a refcount of 0.
550 : : */
551 : 0 : static int __remove_mapping(struct address_space *mapping, struct page *page)
552 : : {
553 [ - + ]: 276841 : BUG_ON(!PageLocked(page));
554 [ - + ]: 276841 : BUG_ON(mapping != page_mapping(page));
555 : :
556 : : spin_lock_irq(&mapping->tree_lock);
557 : : /*
558 : : * The non racy check for a busy page.
559 : : *
560 : : * Must be careful with the order of the tests. When someone has
561 : : * a ref to the page, it may be possible that they dirty it then
562 : : * drop the reference. So if PageDirty is tested before page_count
563 : : * here, then the following race may occur:
564 : : *
565 : : * get_user_pages(&page);
566 : : * [user mapping goes away]
567 : : * write_to(page);
568 : : * !PageDirty(page) [good]
569 : : * SetPageDirty(page);
570 : : * put_page(page);
571 : : * !page_count(page) [good, discard it]
572 : : *
573 : : * [oops, our write_to data is lost]
574 : : *
575 : : * Reversing the order of the tests ensures such a situation cannot
576 : : * escape unnoticed. The smp_rmb is needed to ensure the page->flags
577 : : * load is not satisfied before that of page->_count.
578 : : *
579 : : * Note that if SetPageDirty is always performed via set_page_dirty,
580 : : * and thus under tree_lock, then this ordering is not required.
581 : : */
582 [ + + ]: 276841 : if (!page_freeze_refs(page, 2))
583 : : goto cannot_free;
584 : : /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
585 [ - + ]: 276674 : if (unlikely(PageDirty(page))) {
586 : : page_unfreeze_refs(page, 2);
587 : : goto cannot_free;
588 : : }
589 : :
590 [ - + ]: 276674 : if (PageSwapCache(page)) {
591 : 0 : swp_entry_t swap = { .val = page_private(page) };
592 : 0 : __delete_from_swap_cache(page);
593 : : spin_unlock_irq(&mapping->tree_lock);
594 : 0 : swapcache_free(swap, page);
595 : : } else {
596 : : void (*freepage)(struct page *);
597 : :
598 : 276674 : freepage = mapping->a_ops->freepage;
599 : :
600 : 276674 : __delete_from_page_cache(page);
601 : : spin_unlock_irq(&mapping->tree_lock);
602 : : mem_cgroup_uncharge_cache_page(page);
603 : :
604 [ - + ]: 276674 : if (freepage != NULL)
605 : 0 : freepage(page);
606 : : }
607 : :
608 : : return 1;
609 : :
610 : : cannot_free:
611 : : spin_unlock_irq(&mapping->tree_lock);
612 : 167 : return 0;
613 : : }
614 : :
615 : : /*
616 : : * Attempt to detach a locked page from its ->mapping. If it is dirty or if
617 : : * someone else has a ref on the page, abort and return 0. If it was
618 : : * successfully detached, return 1. Assumes the caller has a single ref on
619 : : * this page.
620 : : */
621 : 0 : int remove_mapping(struct address_space *mapping, struct page *page)
622 : : {
623 [ + + ]: 156436 : if (__remove_mapping(mapping, page)) {
624 : : /*
625 : : * Unfreezing the refcount with 1 rather than 2 effectively
626 : : * drops the pagecache ref for us without requiring another
627 : : * atomic operation.
628 : : */
629 : : page_unfreeze_refs(page, 1);
630 : 156401 : return 1;
631 : : }
632 : : return 0;
633 : : }
634 : :
635 : : /**
636 : : * putback_lru_page - put previously isolated page onto appropriate LRU list
637 : : * @page: page to be put back to appropriate lru list
638 : : *
639 : : * Add previously isolated @page to appropriate LRU list.
640 : : * Page may still be unevictable for other reasons.
641 : : *
642 : : * lru_lock must not be held, interrupts must be enabled.
643 : : */
644 : 0 : void putback_lru_page(struct page *page)
645 : : {
646 : : bool is_unevictable;
647 : : int was_unevictable = PageUnevictable(page);
648 : :
649 : : VM_BUG_ON(PageLRU(page));
650 : :
651 : : redo:
652 : : ClearPageUnevictable(page);
653 : :
654 [ + + ]: 5684 : if (page_evictable(page)) {
655 : : /*
656 : : * For evictable pages, we can use the cache.
657 : : * In event of a race, worst case is we end up with an
658 : : * unevictable page on [in]active list.
659 : : * We know how to handle that.
660 : : */
661 : : is_unevictable = false;
662 : 1902 : lru_cache_add(page);
663 : : } else {
664 : : /*
665 : : * Put unevictable pages directly on zone's unevictable
666 : : * list.
667 : : */
668 : : is_unevictable = true;
669 : 3782 : add_page_to_unevictable_list(page);
670 : : /*
671 : : * When racing with an mlock or AS_UNEVICTABLE clearing
672 : : * (page is unlocked) make sure that if the other thread
673 : : * does not observe our setting of PG_lru and fails
674 : : * isolation/check_move_unevictable_pages,
675 : : * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
676 : : * the page back to the evictable list.
677 : : *
678 : : * The other side is TestClearPageMlocked() or shmem_lock().
679 : : */
680 : 3782 : smp_mb();
681 : : }
682 : :
683 : : /*
684 : : * page's status can change while we move it among lru. If an evictable
685 : : * page is on the unevictable list, it will never be freed. To avoid that,
686 : : * check after we added it to the list, again.
687 : : */
688 [ + + ][ - + ]: 5684 : if (is_unevictable && page_evictable(page)) {
689 [ # # ]: 0 : if (!isolate_lru_page(page)) {
690 : 0 : put_page(page);
691 : 0 : goto redo;
692 : : }
693 : : /* This means someone else dropped this page from LRU
694 : : * So, it will be freed or putback to LRU again. There is
695 : : * nothing to do here.
696 : : */
697 : : }
698 : :
699 [ + + ]: 5684 : if (was_unevictable && !is_unevictable)
700 : : count_vm_event(UNEVICTABLE_PGRESCUED);
701 [ + - ]: 3782 : else if (!was_unevictable && is_unevictable)
702 : : count_vm_event(UNEVICTABLE_PGCULLED);
703 : :
704 : 5684 : put_page(page); /* drop ref from isolate */
705 : 5684 : }
706 : :
707 : : enum page_references {
708 : : PAGEREF_RECLAIM,
709 : : PAGEREF_RECLAIM_CLEAN,
710 : : PAGEREF_KEEP,
711 : : PAGEREF_ACTIVATE,
712 : : };
713 : :
714 : 151090 : static enum page_references page_check_references(struct page *page,
715 : : struct scan_control *sc)
716 : : {
717 : : int referenced_ptes, referenced_page;
718 : : unsigned long vm_flags;
719 : :
720 : 151090 : referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
721 : : &vm_flags);
722 : : referenced_page = TestClearPageReferenced(page);
723 : :
724 : : /*
725 : : * Mlock lost the isolation race with us. Let try_to_unmap()
726 : : * move the page to the unevictable list.
727 : : */
728 [ + + ]: 151090 : if (vm_flags & VM_LOCKED)
729 : : return PAGEREF_RECLAIM;
730 : :
731 [ + + ]: 151089 : if (referenced_ptes) {
732 [ + - ]: 637 : if (PageSwapBacked(page))
733 : : return PAGEREF_ACTIVATE;
734 : : /*
735 : : * All mapped pages start out with page table
736 : : * references from the instantiating fault, so we need
737 : : * to look twice if a mapped file page is used more
738 : : * than once.
739 : : *
740 : : * Mark it and spare it for another trip around the
741 : : * inactive list. Another page table reference will
742 : : * lead to its activation.
743 : : *
744 : : * Note: the mark is set for activated pages as well
745 : : * so that recently deactivated but used pages are
746 : : * quickly recovered.
747 : : */
748 : : SetPageReferenced(page);
749 : :
750 [ + + ]: 637 : if (referenced_page || referenced_ptes > 1)
751 : : return PAGEREF_ACTIVATE;
752 : :
753 : : /*
754 : : * Activate file-backed executable pages after first usage.
755 : : */
756 [ - + ]: 421 : if (vm_flags & VM_EXEC)
757 : : return PAGEREF_ACTIVATE;
758 : :
759 : : return PAGEREF_KEEP;
760 : : }
761 : :
762 : : /* Reclaim if clean, defer dirty pages to writeback */
763 [ + + ][ - + ]: 150452 : if (referenced_page && !PageSwapBacked(page))
764 : : return PAGEREF_RECLAIM_CLEAN;
765 : :
766 : : return PAGEREF_RECLAIM;
767 : : }
768 : :
769 : : /* Check if a page is dirty or under writeback */
770 : 0 : static void page_check_dirty_writeback(struct page *page,
771 : : bool *dirty, bool *writeback)
772 : : {
773 : : struct address_space *mapping;
774 : :
775 : : /*
776 : : * Anonymous pages are not handled by flushers and must be written
777 : : * from reclaim context. Do not stall reclaim based on them
778 : : */
779 [ - + ]: 151281 : if (!page_is_file_cache(page)) {
780 : 0 : *dirty = false;
781 : 0 : *writeback = false;
782 : 0 : return;
783 : : }
784 : :
785 : : /* By default assume that the page flags are accurate */
786 : 151281 : *dirty = PageDirty(page);
787 : 151281 : *writeback = PageWriteback(page);
788 : :
789 : : /* Verify dirty/writeback state if the filesystem supports it */
790 [ + + ]: 151281 : if (!page_has_private(page))
791 : : return;
792 : :
793 : 119006 : mapping = page_mapping(page);
794 [ + + ][ + + ]: 119006 : if (mapping && mapping->a_ops->is_dirty_writeback)
795 : 64954 : mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
796 : : }
797 : :
798 : : /*
799 : : * shrink_page_list() returns the number of reclaimed pages
800 : : */
801 : 0 : static unsigned long shrink_page_list(struct list_head *page_list,
802 : : struct zone *zone,
803 : 151089 : struct scan_control *sc,
804 : : enum ttu_flags ttu_flags,
805 : : unsigned long *ret_nr_dirty,
806 : : unsigned long *ret_nr_unqueued_dirty,
807 : : unsigned long *ret_nr_congested,
808 : : unsigned long *ret_nr_writeback,
809 : : unsigned long *ret_nr_immediate,
810 : : bool force_reclaim)
811 : : {
812 : 7503 : LIST_HEAD(ret_pages);
813 : 7503 : LIST_HEAD(free_pages);
814 : : int pgactivate = 0;
815 : : unsigned long nr_unqueued_dirty = 0;
816 : : unsigned long nr_dirty = 0;
817 : : unsigned long nr_congested = 0;
818 : : unsigned long nr_reclaimed = 0;
819 : : unsigned long nr_writeback = 0;
820 : : unsigned long nr_immediate = 0;
821 : :
822 : 7503 : cond_resched();
823 : :
824 : : mem_cgroup_uncharge_start();
825 [ + + ]: 161179 : while (!list_empty(page_list)) {
826 : : struct address_space *mapping;
827 : 300026 : struct page *page;
828 : : int may_enter_fs;
829 : : enum page_references references = PAGEREF_RECLAIM_CLEAN;
830 : : bool dirty, writeback;
831 : :
832 : 153676 : cond_resched();
833 : :
834 : 153677 : page = lru_to_page(page_list);
835 : : list_del(&page->lru);
836 : :
837 [ + + ]: 153677 : if (!trylock_page(page))
838 : : goto keep;
839 : :
840 : : VM_BUG_ON(PageActive(page));
841 : : VM_BUG_ON(page_zone(page) != zone);
842 : :
843 : 151282 : sc->nr_scanned++;
844 : :
845 [ + - ]: 151282 : if (unlikely(!page_evictable(page)))
846 : : goto cull_mlocked;
847 : :
848 [ - + ][ # # ]: 151282 : if (!sc->may_unmap && page_mapped(page))
849 : : goto keep_locked;
850 : :
851 : : /* Double the slab pressure for mapped and swapcache pages */
852 [ + + ][ - + ]: 151281 : if (page_mapped(page) || PageSwapCache(page))
853 : 2183 : sc->nr_scanned++;
854 : :
855 [ - + ][ # # ]: 151281 : may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
856 [ # # ]: 0 : (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
857 : :
858 : : /*
859 : : * The number of dirty pages determines if a zone is marked
860 : : * reclaim_congested which affects wait_iff_congested. kswapd
861 : : * will stall and start writing pages if the tail of the LRU
862 : : * is all dirty unqueued pages.
863 : : */
864 : 151281 : page_check_dirty_writeback(page, &dirty, &writeback);
865 [ + + ][ + + ]: 151282 : if (dirty || writeback)
866 : 1071 : nr_dirty++;
867 : :
868 [ + + ][ + - ]: 151282 : if (dirty && !writeback)
869 : 879 : nr_unqueued_dirty++;
870 : :
871 : : /*
872 : : * Treat this page as congested if the underlying BDI is or if
873 : : * pages are cycling through the LRU so quickly that the
874 : : * pages marked for immediate reclaim are making it to the
875 : : * end of the LRU a second time.
876 : : */
877 : 151282 : mapping = page_mapping(page);
878 [ + + ][ + - ]: 302562 : if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
[ + + ]
879 [ + + ]: 192 : (writeback && PageReclaim(page)))
880 : 189 : nr_congested++;
881 : :
882 : : /*
883 : : * If a page at the tail of the LRU is under writeback, there
884 : : * are three cases to consider.
885 : : *
886 : : * 1) If reclaim is encountering an excessive number of pages
887 : : * under writeback and this page is both under writeback and
888 : : * PageReclaim then it indicates that pages are being queued
889 : : * for IO but are being recycled through the LRU before the
890 : : * IO can complete. Waiting on the page itself risks an
891 : : * indefinite stall if it is impossible to writeback the
892 : : * page due to IO error or disconnected storage so instead
893 : : * note that the LRU is being scanned too quickly and the
894 : : * caller can stall after page list has been processed.
895 : : *
896 : : * 2) Global reclaim encounters a page, memcg encounters a
897 : : * page that is not marked for immediate reclaim or
898 : : * the caller does not have __GFP_IO. In this case mark
899 : : * the page for immediate reclaim and continue scanning.
900 : : *
901 : : * __GFP_IO is checked because a loop driver thread might
902 : : * enter reclaim, and deadlock if it waits on a page for
903 : : * which it is needed to do the write (loop masks off
904 : : * __GFP_IO|__GFP_FS for this reason); but more thought
905 : : * would probably show more reasons.
906 : : *
907 : : * Don't require __GFP_FS, since we're not going into the
908 : : * FS, just waiting on its writeback completion. Worryingly,
909 : : * ext4 gfs2 and xfs allocate pages with
910 : : * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
911 : : * may_enter_fs here is liable to OOM on them.
912 : : *
913 : : * 3) memcg encounters a page that is not already marked
914 : : * PageReclaim. memcg does not have any dirty pages
915 : : * throttling so we could easily OOM just because too many
916 : : * pages are in writeback and there is nothing else to
917 : : * reclaim. Wait for the writeback to complete.
918 : : */
919 [ + + ]: 151282 : if (PageWriteback(page)) {
920 : : /* Case 1 above */
921 [ + + ][ + + ]: 192 : if (current_is_kswapd() &&
922 [ - + ]: 128 : PageReclaim(page) &&
923 : : zone_is_reclaim_writeback(zone)) {
924 : 0 : nr_immediate++;
925 : 0 : goto keep_locked;
926 : :
927 : : /* Case 2 above */
928 : : } else if (global_reclaim(sc) ||
929 : : !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
930 : : /*
931 : : * This is slightly racy - end_page_writeback()
932 : : * might have just cleared PageReclaim, then
933 : : * setting PageReclaim here end up interpreted
934 : : * as PageReadahead - but that does not matter
935 : : * enough to care. What we do want is for this
936 : : * page to have PageReclaim set next time memcg
937 : : * reclaim reaches the tests above, so it will
938 : : * then wait_on_page_writeback() to avoid OOM;
939 : : * and it's also appropriate in global reclaim.
940 : : */
941 : : SetPageReclaim(page);
942 : 192 : nr_writeback++;
943 : :
944 : 192 : goto keep_locked;
945 : :
946 : : /* Case 3 above */
947 : : } else {
948 : : wait_on_page_writeback(page);
949 : : }
950 : : }
951 : :
952 [ + + ]: 151090 : if (!force_reclaim)
953 : 151089 : references = page_check_references(page, sc);
954 : :
955 [ + + - ]: 151089 : switch (references) {
956 : : case PAGEREF_ACTIVATE:
957 : : goto activate_locked;
958 : : case PAGEREF_KEEP:
959 : : goto keep_locked;
960 : : case PAGEREF_RECLAIM:
961 : : case PAGEREF_RECLAIM_CLEAN:
962 : : ; /* try to reclaim the page below */
963 : : }
964 : :
965 : : /*
966 : : * Anonymous process memory has backing store?
967 : : * Try to allocate it some swap space here.
968 : : */
969 [ - + ][ # # ]: 150452 : if (PageAnon(page) && !PageSwapCache(page)) {
970 [ # # ]: 0 : if (!(sc->gfp_mask & __GFP_IO))
971 : : goto keep_locked;
972 [ # # ]: 0 : if (!add_to_swap(page, page_list))
973 : : goto activate_locked;
974 : : may_enter_fs = 1;
975 : :
976 : : /* Adding to swap updated mapping */
977 : 0 : mapping = page_mapping(page);
978 : : }
979 : :
980 : : /*
981 : : * The page is mapped into the page tables of one or more
982 : : * processes. Try to unmap it here.
983 : : */
984 [ + + ][ + - ]: 157956 : if (page_mapped(page) && mapping) {
985 [ + - - - ]: 1546 : switch (try_to_unmap(page, ttu_flags)) {
986 : : case SWAP_FAIL:
987 : : goto activate_locked;
988 : : case SWAP_AGAIN:
989 : : goto keep_locked;
990 : : case SWAP_MLOCK:
991 : : goto cull_mlocked;
992 : : case SWAP_SUCCESS:
993 : : ; /* try to free the page below */
994 : : }
995 : : }
996 : :
997 [ + + ]: 150453 : if (PageDirty(page)) {
998 : : /*
999 : : * Only kswapd can writeback filesystem pages to
1000 : : * avoid risk of stack overflow but only writeback
1001 : : * if many dirty pages have been encountered.
1002 : : */
1003 [ + + ][ + + ]: 879 : if (page_is_file_cache(page) &&
1004 [ + + ]: 511 : (!current_is_kswapd() ||
1005 : : !zone_is_reclaim_dirty(zone))) {
1006 : : /*
1007 : : * Immediately reclaim when written back.
1008 : : * Similar in principle to deactivate_page()
1009 : : * except we already have the page isolated
1010 : : * and know it's dirty
1011 : : */
1012 : 581 : inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
1013 : : SetPageReclaim(page);
1014 : :
1015 : : goto keep_locked;
1016 : : }
1017 : :
1018 [ + + ]: 298 : if (references == PAGEREF_RECLAIM_CLEAN)
1019 : : goto keep_locked;
1020 [ + - ]: 288 : if (!may_enter_fs)
1021 : : goto keep_locked;
1022 [ + - ]: 288 : if (!sc->may_writepage)
1023 : : goto keep_locked;
1024 : :
1025 : : /* Page is dirty, try to write it out here */
1026 [ + - - - ]: 288 : switch (pageout(page, mapping, sc)) {
1027 : : case PAGE_KEEP:
1028 : : goto keep_locked;
1029 : : case PAGE_ACTIVATE:
1030 : : goto activate_locked;
1031 : : case PAGE_SUCCESS:
1032 [ + + ]: 288 : if (PageWriteback(page))
1033 : : goto keep;
1034 [ - + ]: 273 : if (PageDirty(page))
1035 : : goto keep;
1036 : :
1037 : : /*
1038 : : * A synchronous write - probably a ramdisk. Go
1039 : : * ahead and try to reclaim the page.
1040 : : */
1041 [ # # ]: 0 : if (!trylock_page(page))
1042 : : goto keep;
1043 [ # # ][ # # ]: 0 : if (PageDirty(page) || PageWriteback(page))
1044 : : goto keep_locked;
1045 : 0 : mapping = page_mapping(page);
1046 : : case PAGE_CLEAN:
1047 : : ; /* try to free the page below */
1048 : : }
1049 : : }
1050 : :
1051 : : /*
1052 : : * If the page has buffers, try to free the buffer mappings
1053 : : * associated with this page. If we succeed we try to free
1054 : : * the page as well.
1055 : : *
1056 : : * We do this even if the page is PageDirty().
1057 : : * try_to_release_page() does not perform I/O, but it is
1058 : : * possible for a page to have PageDirty set, but it is actually
1059 : : * clean (all its buffers are clean). This happens if the
1060 : : * buffers were written out directly, with submit_bh(). ext3
1061 : : * will do this, as well as the blockdev mapping.
1062 : : * try_to_release_page() will discover that cleanness and will
1063 : : * drop the buffers and mark the page clean - it can be freed.
1064 : : *
1065 : : * Rarely, pages can have buffers and no ->mapping. These are
1066 : : * the pages which were not successfully invalidated in
1067 : : * truncate_complete_page(). We try to drop those buffers here
1068 : : * and if that worked, and the page is no longer mapped into
1069 : : * process address space (page_count == 1) it can be freed.
1070 : : * Otherwise, leave the page on the LRU so it is swappable.
1071 : : */
1072 [ + + ]: 149574 : if (page_has_private(page)) {
1073 [ + + ]: 117936 : if (!try_to_release_page(page, sc->gfp_mask))
1074 : : goto activate_locked;
1075 [ + + ][ + - ]: 88769 : if (!mapping && page_count(page) == 1) {
1076 : 1 : unlock_page(page);
1077 [ - + ]: 1 : if (put_page_testzero(page))
1078 : : goto free_it;
1079 : : else {
1080 : : /*
1081 : : * rare race with speculative reference.
1082 : : * the speculative reference will free
1083 : : * this page shortly, so we may
1084 : : * increment nr_reclaimed here (and
1085 : : * leave it off the LRU).
1086 : : */
1087 : 0 : nr_reclaimed++;
1088 : 153677 : continue;
1089 : : }
1090 : : }
1091 : : }
1092 : :
1093 [ + - ][ + + ]: 120405 : if (!mapping || !__remove_mapping(mapping, page))
1094 : : goto keep_locked;
1095 : :
1096 : : /*
1097 : : * At this point, we have no other references and there is
1098 : : * no way to pick any more up (removed from LRU, removed
1099 : : * from pagecache). Can use non-atomic bitops now (and
1100 : : * we obviously don't have to worry about waking up a process
1101 : : * waiting on the page lock, because there are no references.
1102 : : */
1103 : : __clear_page_locked(page);
1104 : : free_it:
1105 : 120274 : nr_reclaimed++;
1106 : :
1107 : : /*
1108 : : * Is there need to periodically free_page_list? It would
1109 : : * appear not as the counts should be low
1110 : : */
1111 : 120274 : list_add(&page->lru, &free_pages);
1112 : 120274 : continue;
1113 : :
1114 : : cull_mlocked:
1115 [ # # ]: 0 : if (PageSwapCache(page))
1116 : 0 : try_to_free_swap(page);
1117 : 0 : unlock_page(page);
1118 : 0 : putback_lru_page(page);
1119 : 0 : continue;
1120 : :
1121 : : activate_locked:
1122 : : /* Not a candidate for swapping, so reclaim swap space. */
1123 [ - + ][ # # ]: 29805 : if (PageSwapCache(page) && vm_swap_full())
1124 : 0 : try_to_free_swap(page);
1125 : : VM_BUG_ON(PageActive(page));
1126 : : SetPageActive(page);
1127 : 29805 : pgactivate++;
1128 : : keep_locked:
1129 : 30721 : unlock_page(page);
1130 : : keep:
1131 : 33403 : list_add(&page->lru, &ret_pages);
1132 : : VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1133 : : }
1134 : :
1135 : 7503 : free_hot_cold_page_list(&free_pages, 1);
1136 : :
1137 : : list_splice(&ret_pages, page_list);
1138 : : count_vm_events(PGACTIVATE, pgactivate);
1139 : : mem_cgroup_uncharge_end();
1140 : 7503 : *ret_nr_dirty += nr_dirty;
1141 : 7503 : *ret_nr_congested += nr_congested;
1142 : 7503 : *ret_nr_unqueued_dirty += nr_unqueued_dirty;
1143 : 7503 : *ret_nr_writeback += nr_writeback;
1144 : 7503 : *ret_nr_immediate += nr_immediate;
1145 : 7503 : return nr_reclaimed;
1146 : : }
1147 : :
1148 : 0 : unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1149 : : struct list_head *page_list)
1150 : : {
1151 : 0 : struct scan_control sc = {
1152 : : .gfp_mask = GFP_KERNEL,
1153 : : .priority = DEF_PRIORITY,
1154 : : .may_unmap = 1,
1155 : : };
1156 : : unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
1157 : : struct page *page, *next;
1158 : 0 : LIST_HEAD(clean_pages);
1159 : :
1160 [ # # ]: 0 : list_for_each_entry_safe(page, next, page_list, lru) {
1161 [ # # ][ # # ]: 0 : if (page_is_file_cache(page) && !PageDirty(page) &&
1162 : : !isolated_balloon_page(page)) {
1163 : : ClearPageActive(page);
1164 : : list_move(&page->lru, &clean_pages);
1165 : : }
1166 : : }
1167 : :
1168 : 0 : ret = shrink_page_list(&clean_pages, zone, &sc,
1169 : : TTU_UNMAP|TTU_IGNORE_ACCESS,
1170 : : &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1171 : : list_splice(&clean_pages, page_list);
1172 : 0 : __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
1173 : 0 : return ret;
1174 : : }
1175 : :
1176 : : /*
1177 : : * Attempt to remove the specified page from its LRU. Only take this page
1178 : : * if it is of the appropriate PageActive status. Pages which are being
1179 : : * freed elsewhere are also ignored.
1180 : : *
1181 : : * page: page to consider
1182 : : * mode: one of the LRU isolation modes defined above
1183 : : *
1184 : : * returns 0 on success, -ve errno on failure.
1185 : : */
1186 : 0 : int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1187 : : {
1188 : : int ret = -EINVAL;
1189 : :
1190 : : /* Only take pages on the LRU. */
1191 [ + - ]: 227812 : if (!PageLRU(page))
1192 : : return ret;
1193 : :
1194 : : /* Compaction should not handle unevictable pages but CMA can do so */
1195 [ - + ][ # # ]: 227812 : if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1196 : : return ret;
1197 : :
1198 : : ret = -EBUSY;
1199 : :
1200 : : /*
1201 : : * To minimise LRU disruption, the caller can indicate that it only
1202 : : * wants to isolate pages it will be able to operate on without
1203 : : * blocking - clean pages for the most part.
1204 : : *
1205 : : * ISOLATE_CLEAN means that only clean pages should be isolated. This
1206 : : * is used by reclaim when it cannot write to backing storage
1207 : : *
1208 : : * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1209 : : * that it is possible to migrate without blocking
1210 : : */
1211 [ - + ]: 227812 : if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1212 : : /* All the caller can do on PageWriteback is block */
1213 [ # # ]: 0 : if (PageWriteback(page))
1214 : : return ret;
1215 : :
1216 [ # # ]: 0 : if (PageDirty(page)) {
1217 : : struct address_space *mapping;
1218 : :
1219 : : /* ISOLATE_CLEAN means only clean pages */
1220 [ # # ]: 0 : if (mode & ISOLATE_CLEAN)
1221 : : return ret;
1222 : :
1223 : : /*
1224 : : * Only pages without mappings or that have a
1225 : : * ->migratepage callback are possible to migrate
1226 : : * without blocking
1227 : : */
1228 : 0 : mapping = page_mapping(page);
1229 [ # # ][ # # ]: 0 : if (mapping && !mapping->a_ops->migratepage)
1230 : : return ret;
1231 : : }
1232 : : }
1233 : :
1234 [ - + ][ # # ]: 227812 : if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1235 : : return ret;
1236 : :
1237 [ + - ]: 227811 : if (likely(get_page_unless_zero(page))) {
1238 : : /*
1239 : : * Be careful not to clear PageLRU until after we're
1240 : : * sure the page is not being freed elsewhere -- the
1241 : : * page release code relies on it.
1242 : : */
1243 : : ClearPageLRU(page);
1244 : : ret = 0;
1245 : : }
1246 : :
1247 : 227812 : return ret;
1248 : : }
1249 : :
1250 : : /*
1251 : : * zone->lru_lock is heavily contended. Some of the functions that
1252 : : * shrink the lists perform better by taking out a batch of pages
1253 : : * and working on them outside the LRU lock.
1254 : : *
1255 : : * For pagecache intensive workloads, this function is the hottest
1256 : : * spot in the kernel (apart from copy_*_user functions).
1257 : : *
1258 : : * Appropriate locks must be held before calling this function.
1259 : : *
1260 : : * @nr_to_scan: The number of pages to look through on the list.
1261 : : * @lruvec: The LRU vector to pull pages from.
1262 : : * @dst: The temp list to put pages on to.
1263 : : * @nr_scanned: The number of pages that were scanned.
1264 : : * @sc: The scan_control struct for this reclaim session
1265 : : * @mode: One of the LRU isolation modes
1266 : : * @lru: LRU list id for isolating
1267 : : *
1268 : : * returns how many pages were moved onto *@dst.
1269 : : */
1270 : 0 : static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1271 : : struct lruvec *lruvec, struct list_head *dst,
1272 : : unsigned long *nr_scanned, struct scan_control *sc,
1273 : : isolate_mode_t mode, enum lru_list lru)
1274 : : {
1275 : 69794 : struct list_head *src = &lruvec->lists[lru];
1276 : : unsigned long nr_taken = 0;
1277 : : unsigned long scan;
1278 : :
1279 [ + + ][ + + ]: 297605 : for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1280 : : struct page *page;
1281 : : int nr_pages;
1282 : :
1283 : 227812 : page = lru_to_page(src);
1284 [ + + ]: 227812 : prefetchw_prev_lru_page(page, src, flags);
1285 : :
1286 : : VM_BUG_ON(!PageLRU(page));
1287 : :
1288 [ + - - ]: 227812 : switch (__isolate_lru_page(page, mode)) {
1289 : : case 0:
1290 : : nr_pages = hpage_nr_pages(page);
1291 : : mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1292 : 227811 : list_move(&page->lru, dst);
1293 : 227811 : nr_taken += nr_pages;
1294 : : break;
1295 : :
1296 : : case -EBUSY:
1297 : : /* else it is being freed elsewhere */
1298 : 0 : list_move(&page->lru, src);
1299 : 0 : continue;
1300 : :
1301 : : default:
1302 : 0 : BUG();
1303 : : }
1304 : : }
1305 : :
1306 : 69793 : *nr_scanned = scan;
1307 : 69793 : trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1308 : : nr_taken, mode, is_file_lru(lru));
1309 : 0 : return nr_taken;
1310 : : }
1311 : :
1312 : : /**
1313 : : * isolate_lru_page - tries to isolate a page from its LRU list
1314 : : * @page: page to isolate from its LRU list
1315 : : *
1316 : : * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1317 : : * vmstat statistic corresponding to whatever LRU list the page was on.
1318 : : *
1319 : : * Returns 0 if the page was removed from an LRU list.
1320 : : * Returns -EBUSY if the page was not on an LRU list.
1321 : : *
1322 : : * The returned page will have PageLRU() cleared. If it was found on
1323 : : * the active list, it will have PageActive set. If it was found on
1324 : : * the unevictable list, it will have the PageUnevictable bit set. That flag
1325 : : * may need to be cleared by the caller before letting the page go.
1326 : : *
1327 : : * The vmstat statistic corresponding to the list on which the page was
1328 : : * found will be decremented.
1329 : : *
1330 : : * Restrictions:
1331 : : * (1) Must be called with an elevated refcount on the page. This is a
1332 : : * fundamental difference from isolate_lru_pages (which is called
1333 : : * without a stable reference).
1334 : : * (2) the lru_lock must not be held.
1335 : : * (3) interrupts must be enabled.
1336 : : */
1337 : 0 : int isolate_lru_page(struct page *page)
1338 : : {
1339 : : int ret = -EBUSY;
1340 : :
1341 : : VM_BUG_ON(!page_count(page));
1342 : :
1343 [ + - ]: 3795 : if (PageLRU(page)) {
1344 : 3795 : struct zone *zone = page_zone(page);
1345 : : struct lruvec *lruvec;
1346 : :
1347 : : spin_lock_irq(&zone->lru_lock);
1348 : : lruvec = mem_cgroup_page_lruvec(page, zone);
1349 [ + - ]: 3795 : if (PageLRU(page)) {
1350 : : int lru = page_lru(page);
1351 : : get_page(page);
1352 : : ClearPageLRU(page);
1353 : : del_page_from_lru_list(page, lruvec, lru);
1354 : : ret = 0;
1355 : : }
1356 : : spin_unlock_irq(&zone->lru_lock);
1357 : : }
1358 : 3795 : return ret;
1359 : : }
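/*
 * Editor's note: a hedged usage sketch for isolate_lru_page() under
 * restrictions (1)-(3) above. The caller is hypothetical; it already
 * holds a page reference, holds no lru_lock and runs with interrupts
 * enabled.
 */
static int demo_isolate_and_putback(struct page *page)
{
	int err;

	get_page(page);			/* (1) elevated refcount */
	err = isolate_lru_page(page);	/* (2)+(3) no lru_lock, irqs on */
	if (err) {
		put_page(page);
		return err;		/* -EBUSY: page was not on an LRU */
	}
	/* ... work on the page while it is off the LRU ... */
	putback_lru_page(page);		/* drops the isolation reference */
	put_page(page);			/* drop our extra reference */
	return 0;
}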
1360 : :
1361 : : /*
1362 : : * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1363 : : * then get rescheduled. When there is a massive number of tasks doing page
1364 : : * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1365 : : * the LRU list will go small and be scanned faster than necessary, leading to
1366 : : * unnecessary swapping, thrashing and OOM.
1367 : : */
1368 : 58835 : static int too_many_isolated(struct zone *zone, int file,
1369 : : struct scan_control *sc)
1370 : : {
1371 : : unsigned long inactive, isolated;
1372 : :
1373 [ + ]: 58835 : if (current_is_kswapd())
1374 : : return 0;
1375 : :
1376 : : if (!global_reclaim(sc))
1377 : : return 0;
1378 : :
1379 [ + - ]: 106835 : if (file) {
1380 : : inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1381 : : isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1382 : : } else {
1383 : : inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1384 : : isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1385 : : }
1386 : :
1387 : : /*
1388 : : * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1389 : : * won't get blocked by normal direct-reclaimers, forming a circular
1390 : : * deadlock.
1391 : : */
1392 [ + + ]: 48000 : if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1393 : 47997 : inactive >>= 3;
1394 : :
1395 : 48000 : return isolated > inactive;
1396 : : }
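/*
 * Editor's note: assumed numbers for the throttle above. With 80000
 * inactive file pages, a direct reclaimer whose gfp_mask includes the
 * full GFP_IOFS is considered "too many isolated" once more than
 * 80000 >> 3 = 10000 pages are isolated, while a GFP_NOFS/GFP_NOIO
 * caller keeps the full 80000-page headroom and so cannot deadlock
 * waiting behind normal, throttled direct reclaimers.
 */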
1397 : :
1398 : : static noinline_for_stack void
1399 : 0 : putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1400 : : {
1401 : : struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1402 : : struct zone *zone = lruvec_zone(lruvec);
1403 : 7503 : LIST_HEAD(pages_to_free);
1404 : :
1405 : : /*
1406 : : * Put back any unfreeable pages.
1407 : : */
1408 [ + + ]: 40906 : while (!list_empty(page_list)) {
1409 : 33403 : struct page *page = lru_to_page(page_list);
1410 : : int lru;
1411 : :
1412 : : VM_BUG_ON(PageLRU(page));
1413 : : list_del(&page->lru);
1414 [ - + ]: 33403 : if (unlikely(!page_evictable(page))) {
1415 : : spin_unlock_irq(&zone->lru_lock);
1416 : 0 : putback_lru_page(page);
1417 : : spin_lock_irq(&zone->lru_lock);
1418 : 0 : continue;
1419 : : }
1420 : :
1421 : : lruvec = mem_cgroup_page_lruvec(page, zone);
1422 : :
1423 : : SetPageLRU(page);
1424 : : lru = page_lru(page);
1425 : : add_page_to_lru_list(page, lruvec, lru);
1426 : :
1427 [ + + ]: 33403 : if (is_active_lru(lru)) {
1428 : : int file = is_file_lru(lru);
1429 : : int numpages = hpage_nr_pages(page);
1430 : 29805 : reclaim_stat->recent_rotated[file] += numpages;
1431 : : }
1432 [ - + ]: 33403 : if (put_page_testzero(page)) {
1433 : : __ClearPageLRU(page);
1434 : : __ClearPageActive(page);
1435 : : del_page_from_lru_list(page, lruvec, lru);
1436 : :
1437 [ # # ]: 0 : if (unlikely(PageCompound(page))) {
1438 : : spin_unlock_irq(&zone->lru_lock);
1439 : 0 : (*get_compound_page_dtor(page))(page);
1440 : : spin_lock_irq(&zone->lru_lock);
1441 : : } else
1442 : : list_add(&page->lru, &pages_to_free);
1443 : : }
1444 : : }
1445 : :
1446 : : /*
1447 : : * To save our caller's stack, now use input list for pages to free.
1448 : : */
1449 : : list_splice(&pages_to_free, page_list);
1450 : 7503 : }
1451 : :
1452 : : /*
1453 : : * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1454 : : * of reclaimed pages
1455 : : */
1456 : : static noinline_for_stack unsigned long
1457 : 0 : shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1458 : : struct scan_control *sc, enum lru_list lru)
1459 : : {
1460 : 58834 : LIST_HEAD(page_list);
1461 : : unsigned long nr_scanned;
1462 : : unsigned long nr_reclaimed = 0;
1463 : : unsigned long nr_taken;
1464 : 58834 : unsigned long nr_dirty = 0;
1465 : 58834 : unsigned long nr_congested = 0;
1466 : 58834 : unsigned long nr_unqueued_dirty = 0;
1467 : 58834 : unsigned long nr_writeback = 0;
1468 : 58834 : unsigned long nr_immediate = 0;
1469 : : isolate_mode_t isolate_mode = 0;
1470 : : int file = is_file_lru(lru);
1471 : 58834 : struct zone *zone = lruvec_zone(lruvec);
1472 : : struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1473 : :
1474 [ - + ]: 58834 : while (unlikely(too_many_isolated(zone, file, sc))) {
1475 : 0 : congestion_wait(BLK_RW_ASYNC, HZ/10);
1476 : :
1477 : : /* We are about to die and free our memory. Return now. */
1478 [ # # ]: 0 : if (fatal_signal_pending(current))
1479 : : return SWAP_CLUSTER_MAX;
1480 : : }
1481 : :
1482 : 58839 : lru_add_drain();
1483 : :
1484 [ - + ]: 58843 : if (!sc->may_unmap)
1485 : : isolate_mode |= ISOLATE_UNMAPPED;
1486 [ - + ]: 58843 : if (!sc->may_writepage)
1487 : 0 : isolate_mode |= ISOLATE_CLEAN;
1488 : :
1489 : : spin_lock_irq(&zone->lru_lock);
1490 : :
1491 : 58846 : nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1492 : : &nr_scanned, sc, isolate_mode, lru);
1493 : :
1494 : 58843 : __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1495 : 58845 : __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1496 : :
1497 : : if (global_reclaim(sc)) {
1498 : 58845 : zone->pages_scanned += nr_scanned;
1499 [ + + ]: 58845 : if (current_is_kswapd())
1500 : 10842 : __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1501 : : else
1502 : 48003 : __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1503 : : }
1504 : : spin_unlock_irq(&zone->lru_lock);
1505 : :
1506 [ + + ]: 58846 : if (nr_taken == 0)
1507 : : return 0;
1508 : :
1509 : 7503 : nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1510 : : &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1511 : : &nr_writeback, &nr_immediate,
1512 : : false);
1513 : :
1514 : : spin_lock_irq(&zone->lru_lock);
1515 : :
1516 : 7503 : reclaim_stat->recent_scanned[file] += nr_taken;
1517 : :
1518 : : if (global_reclaim(sc)) {
1519 [ + + ]: 7503 : if (current_is_kswapd())
1520 : 7000 : __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1521 : : nr_reclaimed);
1522 : : else
1523 : 503 : __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1524 : : nr_reclaimed);
1525 : : }
1526 : :
1527 : 7503 : putback_inactive_pages(lruvec, &page_list);
1528 : :
1529 : 7503 : __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1530 : :
1531 : : spin_unlock_irq(&zone->lru_lock);
1532 : :
1533 : 7503 : free_hot_cold_page_list(&page_list, 1);
1534 : :
1535 : : /*
1536 : : * If reclaim is isolating dirty pages under writeback, it implies
1537 : : * that the long-lived page allocation rate is exceeding the page
1538 : : * laundering rate. Either the global limits are not being effective
1539 : : * at throttling processes due to the page distribution throughout
1540 : : * zones or there is heavy usage of a slow backing device. The
1541 : : * only option is to throttle from reclaim context which is not ideal
1542 : : * as there is no guarantee the dirtying process is throttled in the
1543 : : * same way balance_dirty_pages() manages.
1544 : : *
1545 : : * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1546 : : * of pages under pages flagged for immediate reclaim and stall if any
1547 : : * are encountered in the nr_immediate check below.
1548 : : */
1549 [ + + ][ + + ]: 7503 : if (nr_writeback && nr_writeback == nr_taken)
1550 : : zone_set_flag(zone, ZONE_WRITEBACK);
1551 : :
1552 : : /*
1553 : : * memcg will stall in page writeback so only consider forcibly
1554 : : * stalling for global reclaim
1555 : : */
1556 : : if (global_reclaim(sc)) {
1557 : : /*
1558 : : * Tag a zone as congested if all the dirty pages scanned were
1559 : : * backed by a congested BDI and wait_iff_congested will stall.
1560 : : */
1561 [ + + ][ + + ]: 7503 : if (nr_dirty && nr_dirty == nr_congested)
1562 : : zone_set_flag(zone, ZONE_CONGESTED);
1563 : :
1564 : : /*
1565 : : * If dirty pages are scanned that are not queued for IO, it
1566 : : * implies that flushers are not keeping up. In this case, flag
1567 : : * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1568 : : * pages from reclaim context. It will forcibly stall in the
1569 : : * next check.
1570 : : */
1571 [ + + ]: 7503 : if (nr_unqueued_dirty == nr_taken)
1572 : : zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1573 : :
1574 : : /*
1575 : : * In addition, if kswapd scans pages marked for
1576 : : * immediate reclaim and under writeback (nr_immediate), it
1577 : : * implies that pages are cycling through the LRU faster than
1578 : : * they are written so also forcibly stall.
1579 : : */
1580 [ + + ][ + ]: 7503 : if (nr_unqueued_dirty == nr_taken || nr_immediate)
1581 : 0 : congestion_wait(BLK_RW_ASYNC, HZ/10);
1582 : : }
1583 : :
1584 : : /*
1585 : : * Stall direct reclaim for IO completions if the underlying BDIs or the
1586 : : * zone is congested. Allow kswapd to continue until it starts encountering
1587 : : * unqueued dirty pages or cycling through the LRU too quickly.
1588 : : */
1589 [ + - ][ + + ]: 66337 : if (!sc->hibernation_mode && !current_is_kswapd())
1590 : 503 : wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1591 : :
1592 [ - + ]: 7503 : trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1593 : 7503 : zone_idx(zone),
1594 : : nr_scanned, nr_reclaimed,
1595 : : sc->priority,
1596 : : trace_shrink_flags(file));
1597 : 7503 : return nr_reclaimed;
1598 : : }
1599 : :
1600 : : /*
1601 : : * This moves pages from the active list to the inactive list.
1602 : : *
1603 : : * We move them the other way if the page is referenced by one or more
1604 : : * processes, from rmap.
1605 : : *
1606 : : * If the pages are mostly unmapped, the processing is fast and it is
1607 : : * appropriate to hold zone->lru_lock across the whole operation. But if
1608 : : * the pages are mapped, the processing is slow (page_referenced()) so we
1609 : : * should drop zone->lru_lock around each page. It's impossible to balance
1610 : : * this, so instead we remove the pages from the LRU while processing them.
1611 : : * It is safe to rely on PG_active against the non-LRU pages in here because
1612 : : * nobody will play with that bit on a non-LRU page.
1613 : : *
1614 : : * The downside is that we have to touch page->_count against each page.
1615 : : * But we had to alter page->flags anyway.
1616 : : */
1617 : :
1618 : 0 : static void move_active_pages_to_lru(struct lruvec *lruvec,
1619 : : struct list_head *list,
1620 : : struct list_head *pages_to_free,
1621 : : enum lru_list lru)
1622 : : {
1623 : 21916 : struct zone *zone = lruvec_zone(lruvec);
1624 : : unsigned long pgmoved = 0;
1625 : 0 : struct page *page;
1626 : : int nr_pages;
1627 : :
1628 [ + + ]: 96051 : while (!list_empty(list)) {
1629 : 74135 : page = lru_to_page(list);
1630 : : lruvec = mem_cgroup_page_lruvec(page, zone);
1631 : :
1632 : : VM_BUG_ON(PageLRU(page));
1633 : : SetPageLRU(page);
1634 : :
1635 : : nr_pages = hpage_nr_pages(page);
1636 : : mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1637 : 74135 : list_move(&page->lru, &lruvec->lists[lru]);
1638 : 74135 : pgmoved += nr_pages;
1639 : :
1640 [ - + ]: 74135 : if (put_page_testzero(page)) {
1641 : : __ClearPageLRU(page);
1642 : : __ClearPageActive(page);
1643 : : del_page_from_lru_list(page, lruvec, lru);
1644 : :
1645 [ # # ]: 0 : if (unlikely(PageCompound(page))) {
1646 : : spin_unlock_irq(&zone->lru_lock);
1647 : 0 : (*get_compound_page_dtor(page))(page);
1648 : : spin_lock_irq(&zone->lru_lock);
1649 : : } else
1650 : : list_add(&page->lru, pages_to_free);
1651 : : }
1652 : : }
1653 : 21916 : __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1654 [ + + ]: 21915 : if (!is_active_lru(lru))
1655 : : __count_vm_events(PGDEACTIVATE, pgmoved);
1656 : 21915 : }
1657 : :
1658 : 0 : static void shrink_active_list(unsigned long nr_to_scan,
1659 : : struct lruvec *lruvec,
1660 : : struct scan_control *sc,
1661 : : enum lru_list lru)
1662 : : {
1663 : : unsigned long nr_taken;
1664 : : unsigned long nr_scanned;
1665 : : unsigned long vm_flags;
1666 : 10958 : LIST_HEAD(l_hold); /* The pages which were snipped off */
1667 : 10958 : LIST_HEAD(l_active);
1668 : 10958 : LIST_HEAD(l_inactive);
1669 : 0 : struct page *page;
1670 : : struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1671 : : unsigned long nr_rotated = 0;
1672 : : isolate_mode_t isolate_mode = 0;
1673 : : int file = is_file_lru(lru);
1674 : 10958 : struct zone *zone = lruvec_zone(lruvec);
1675 : :
1676 : 10958 : lru_add_drain();
1677 : :
1678 [ - + ]: 10958 : if (!sc->may_unmap)
1679 : : isolate_mode |= ISOLATE_UNMAPPED;
1680 [ - + ]: 10958 : if (!sc->may_writepage)
1681 : 0 : isolate_mode |= ISOLATE_CLEAN;
1682 : :
1683 : : spin_lock_irq(&zone->lru_lock);
1684 : :
1685 : 10958 : nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1686 : : &nr_scanned, sc, isolate_mode, lru);
1687 : : if (global_reclaim(sc))
1688 : 10958 : zone->pages_scanned += nr_scanned;
1689 : :
1690 : 10958 : reclaim_stat->recent_scanned[file] += nr_taken;
1691 : :
1692 : 10958 : __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1693 : 10958 : __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1694 : 10958 : __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1695 : : spin_unlock_irq(&zone->lru_lock);
1696 : :
1697 [ + + ]: 85090 : while (!list_empty(&l_hold)) {
1698 : 74134 : cond_resched();
1699 : 74135 : page = lru_to_page(&l_hold);
1700 : : list_del(&page->lru);
1701 : :
1702 [ - + ]: 74135 : if (unlikely(!page_evictable(page))) {
1703 : 0 : putback_lru_page(page);
1704 : 0 : continue;
1705 : : }
1706 : :
1707 [ - + ]: 74135 : if (unlikely(buffer_heads_over_limit)) {
1708 [ # # # # ]: 0 : if (page_has_private(page) && trylock_page(page)) {
1709 [ # # ]: 0 : if (page_has_private(page))
1710 : 0 : try_to_release_page(page, 0);
1711 : 0 : unlock_page(page);
1712 : : }
1713 : : }
1714 : :
1715 [ + + ]: 74135 : if (page_referenced(page, 0, sc->target_mem_cgroup,
1716 : : &vm_flags)) {
1717 : 1051 : nr_rotated += hpage_nr_pages(page);
1718 : : /*
1719 : : * Identify referenced, file-backed active pages and
1720 : : * give them one more trip around the active list. So
1721 : : * that executable code gets a better chance to stay in
1722 : : * memory under moderate memory pressure. Anon pages
1723 : : * are not likely to be evicted by use-once streaming
1724 : : * IO, plus JVM can create lots of anon VM_EXEC pages,
1725 : : * so we ignore them here.
1726 : : */
1727 [ + + ][ + - ]: 1051 : if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1728 : 1026 : list_add(&page->lru, &l_active);
1729 : 1026 : continue;
1730 : : }
1731 : : }
1732 : :
1733 : : ClearPageActive(page); /* we are de-activating */
1734 : 73109 : list_add(&page->lru, &l_inactive);
1735 : : }
1736 : :
1737 : : /*
1738 : : * Move pages back to the lru list.
1739 : : */
1740 : : spin_lock_irq(&zone->lru_lock);
1741 : : /*
1742 : : * Count referenced pages from currently used mappings as rotated,
1743 : : * even though only some of them are actually re-activated. This
1744 : : * helps balance scan pressure between file and anonymous pages in
1745 : : * get_scan_count().
1746 : : */
1747 : 10957 : reclaim_stat->recent_rotated[file] += nr_rotated;
1748 : :
1749 : 10957 : move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1750 : 10958 : move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1751 : 10958 : __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1752 : : spin_unlock_irq(&zone->lru_lock);
1753 : :
1754 : 10958 : free_hot_cold_page_list(&l_hold, 1);
1755 : 10958 : }
1756 : :
1757 : : #ifdef CONFIG_SWAP
1758 : : static int inactive_anon_is_low_global(struct zone *zone)
1759 : : {
1760 : : unsigned long active, inactive;
1761 : :
1762 : : active = zone_page_state(zone, NR_ACTIVE_ANON);
1763 : : inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1764 : :
1765 [ # # ][ # # ]: 0 : if (inactive * zone->inactive_ratio < active)
[ # # ]
1766 : : return 1;
1767 : :
1768 : : return 0;
1769 : : }
1770 : :
1771 : : /**
1772 : : * inactive_anon_is_low - check if anonymous pages need to be deactivated
1773 : : * @lruvec: LRU vector to check
1774 : : *
1775 : : * Returns true if the zone does not have enough inactive anon pages,
1776 : : * meaning some active anon pages need to be deactivated.
1777 : : */
1778 : : static int inactive_anon_is_low(struct lruvec *lruvec)
1779 : : {
1780 : : /*
1781 : : * If we don't have swap space, anonymous page deactivation
1782 : : * is pointless.
1783 : : */
1784 [ # # - + ]: 203477 : if (!total_swap_pages)
[ # # ]
1785 : : return 0;
1786 : :
1787 : : if (!mem_cgroup_disabled())
1788 : : return mem_cgroup_inactive_anon_is_low(lruvec);
1789 : :
1790 : : return inactive_anon_is_low_global(lruvec_zone(lruvec));
1791 : : }
1792 : : #else
1793 : : static inline int inactive_anon_is_low(struct lruvec *lruvec)
1794 : : {
1795 : : return 0;
1796 : : }
1797 : : #endif
1798 : :
1799 : : /**
1800 : : * inactive_file_is_low - check if file pages need to be deactivated
1801 : : * @lruvec: LRU vector to check
1802 : : *
1803 : : * When the system is doing streaming IO, memory pressure here
1804 : : * ensures that active file pages get deactivated, until more
1805 : : * than half of the file pages are on the inactive list.
1806 : : *
1807 : : * Once we get to that situation, protect the system's working
1808 : : * set from being evicted by disabling active file page aging.
1809 : : *
1810 : : * This uses a different ratio than the anonymous pages, because
1811 : : * the page cache uses a use-once replacement algorithm.
1812 : : */
1813 : : static int inactive_file_is_low(struct lruvec *lruvec)
1814 : : {
1815 : : unsigned long inactive;
1816 : : unsigned long active;
1817 : :
1818 : : inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1819 : : active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1820 : :
1821 : 60570 : return active > inactive;
1822 : : }
1823 : :
1824 : 0 : static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1825 : : {
1826 [ + - ]: 60570 : if (is_file_lru(lru))
1827 : 60570 : return inactive_file_is_low(lruvec);
1828 : : else
1829 : 0 : return inactive_anon_is_low(lruvec);
1830 : : }
1831 : :
1832 : 0 : static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1833 : : struct lruvec *lruvec, struct scan_control *sc)
1834 : : {
1835 [ + + ]: 119398 : if (is_active_lru(lru)) {
1836 [ + + ]: 60571 : if (inactive_list_is_low(lruvec, lru))
1837 : 10957 : shrink_active_list(nr_to_scan, lruvec, sc, lru);
1838 : : return 0;
1839 : : }
1840 : :
1841 : 58827 : return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1842 : : }
1843 : :
1844 : : static int vmscan_swappiness(struct scan_control *sc)
1845 : : {
1846 : : if (global_reclaim(sc))
1847 : 0 : return vm_swappiness;
1848 : : return mem_cgroup_swappiness(sc->target_mem_cgroup);
1849 : : }
1850 : :
1851 : : enum scan_balance {
1852 : : SCAN_EQUAL,
1853 : : SCAN_FRACT,
1854 : : SCAN_ANON,
1855 : : SCAN_FILE,
1856 : : };
1857 : :
1858 : : /*
1859 : : * Determine how aggressively the anon and file LRU lists should be
1860 : : * scanned. The relative value of each set of LRU lists is determined
1861 : : * by looking at the fraction of the pages scanned we did rotate back
1862 : : * onto the active list instead of evict.
1863 : : *
1864 : : * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1865 : : * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1866 : : */
1867 : 0 : static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1868 : : unsigned long *nr)
1869 : : {
1870 : : struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1871 : : u64 fraction[2];
1872 : : u64 denominator = 0; /* gcc */
1873 : 202653 : struct zone *zone = lruvec_zone(lruvec);
1874 : : unsigned long anon_prio, file_prio;
1875 : : enum scan_balance scan_balance;
1876 : : unsigned long anon, file, free;
1877 : : bool force_scan = false;
1878 : : unsigned long ap, fp;
1879 : : enum lru_list lru;
1880 : :
1881 : : /*
1882 : : * If the zone or memcg is small, nr[l] can be 0. This
1883 : : * results in no scanning on this priority and a potential
1884 : : * priority drop. Global direct reclaim can go to the next
1885 : : * zone and tends to have no problems. Global kswapd is for
1886 : : * zone balancing and it needs to scan a minimum amount. When
1887 : : * reclaiming for a memcg, a priority drop can cause high
1888 : : * latencies, so it's better to scan a minimum amount there as
1889 : : * well.
1890 : : */
1891 [ + + + + ]: 241029 : if (current_is_kswapd() && !zone_reclaimable(zone))
1892 : : force_scan = true;
1893 : : if (!global_reclaim(sc))
1894 : : force_scan = true;
1895 : :
1896 : : /* If we have no swap space, do not bother scanning anon pages. */
1897 [ + ][ - + ]: 202653 : if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1898 : : scan_balance = SCAN_FILE;
1899 : : goto out;
1900 : : }
1901 : :
1902 : : /*
1903 : : * Global reclaim will swap to prevent OOM even with no
1904 : : * swappiness, but memcg users want to use this knob to
1905 : : * disable swapping for individual groups completely when
1906 : : * using the memory controller's swap limit feature would be
1907 : : * too expensive.
1908 : : */
1909 : : if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1910 : : scan_balance = SCAN_FILE;
1911 : : goto out;
1912 : : }
1913 : :
1914 : : /*
1915 : : * Do not apply any pressure balancing cleverness when the
1916 : : * system is close to OOM, scan both anon and file equally
1917 : : * (unless the swappiness setting disagrees with swapping).
1918 : : */
1919 [ # # ][ # # ]: 0 : if (!sc->priority && vmscan_swappiness(sc)) {
1920 : : scan_balance = SCAN_EQUAL;
1921 : : goto out;
1922 : : }
1923 : :
1924 : 0 : anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1925 : : get_lru_size(lruvec, LRU_INACTIVE_ANON);
1926 : 0 : file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1927 : : get_lru_size(lruvec, LRU_INACTIVE_FILE);
1928 : :
1929 : : /*
1930 : : * If it's foreseeable that reclaiming the file cache won't be
1931 : : * enough to get the zone back into a desirable shape, we have
1932 : : * to swap. Better start now and leave the - probably heavily
1933 : : * thrashing - remaining file pages alone.
1934 : : */
1935 : : if (global_reclaim(sc)) {
1936 : : free = zone_page_state(zone, NR_FREE_PAGES);
1937 [ # # ]: 0 : if (unlikely(file + free <= high_wmark_pages(zone))) {
1938 : : scan_balance = SCAN_ANON;
1939 : : goto out;
1940 : : }
1941 : : }
1942 : :
1943 : : /*
1944 : : * There is enough inactive page cache, do not reclaim
1945 : : * anything from the anonymous working set right now.
1946 : : */
1947 [ # # ]: 0 : if (!inactive_file_is_low(lruvec)) {
1948 : : scan_balance = SCAN_FILE;
1949 : : goto out;
1950 : : }
1951 : :
1952 : : scan_balance = SCAN_FRACT;
1953 : :
1954 : : /*
1955 : : * With swappiness at 100, anonymous and file have the same priority.
1956 : : * This scanning priority is essentially the inverse of IO cost.
1957 : : */
1958 : 0 : anon_prio = vmscan_swappiness(sc);
1959 : 0 : file_prio = 200 - anon_prio;
1960 : :
1961 : : /*
1962 : : * OK, so we have swap space and a fair amount of page cache
1963 : : * pages. We use the recently rotated / recently scanned
1964 : : * ratios to determine how valuable each cache is.
1965 : : *
1966 : : * Because workloads change over time (and to avoid overflow)
1967 : : * we keep these statistics as a floating average, which ends
1968 : : * up weighing recent references more than old ones.
1969 : : *
1970 : : * anon in [0], file in [1]
1971 : : */
1972 : : spin_lock_irq(&zone->lru_lock);
1973 [ # # ]: 0 : if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1974 : 0 : reclaim_stat->recent_scanned[0] /= 2;
1975 : 0 : reclaim_stat->recent_rotated[0] /= 2;
1976 : : }
1977 : :
1978 [ # # ]: 0 : if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1979 : 0 : reclaim_stat->recent_scanned[1] /= 2;
1980 : 0 : reclaim_stat->recent_rotated[1] /= 2;
1981 : : }
1982 : :
1983 : : /*
1984 : : * The amount of pressure on anon vs file pages is inversely
1985 : : * proportional to the fraction of recently scanned pages on
1986 : : * each list that were recently referenced and in active use.
1987 : : */
1988 : 0 : ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1989 : 0 : ap /= reclaim_stat->recent_rotated[0] + 1;
1990 : :
1991 : 0 : fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1992 : 0 : fp /= reclaim_stat->recent_rotated[1] + 1;
1993 : : spin_unlock_irq(&zone->lru_lock);
1994 : :
1995 : 0 : fraction[0] = ap;
1996 : 0 : fraction[1] = fp;
1997 : 202653 : denominator = ap + fp + 1;
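	/*
	 * Worked example with hypothetical numbers: swappiness 60 gives
	 * anon_prio = 60 and file_prio = 140.  If the floating averages
	 * are scanned/rotated = 4000/3000 for anon and 16000/2000 for
	 * file, then ap = 60 * 4001 / 3001 ~= 79 and
	 * fp = 140 * 16001 / 2001 ~= 1119, so roughly 79/1199 of the
	 * scan pressure lands on anon and 1119/1199 on file: the list
	 * whose pages were rarely re-activated is scanned much harder.
	 */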
1998 : : out:
1999 [ + + ]: 1015014 : for_each_evictable_lru(lru) {
2000 : : int file = is_file_lru(lru);
2001 : : unsigned long size;
2002 : : unsigned long scan;
2003 : :
2004 : : size = get_lru_size(lruvec, lru);
2005 : 811779 : scan = size >> sc->priority;
2006 : :
2007 [ + + ]: 811779 : if (!scan && force_scan)
2008 : 5715 : scan = min(size, SWAP_CLUSTER_MAX);
2009 : :
2010 [ - + + ]: 811779 : switch (scan_balance) {
2011 : : case SCAN_EQUAL:
2012 : : /* Scan lists relative to size */
2013 : : break;
2014 : : case SCAN_FRACT:
2015 : : /*
2016 : : * Scan types proportional to swappiness and
2017 : : * their relative recent reclaim efficiency.
2018 : : */
2019 : 0 : scan = div64_u64(scan * fraction[file], denominator);
2020 : : break;
2021 : : case SCAN_FILE:
2022 : : case SCAN_ANON:
2023 : : /* Scan one type exclusively */
2024 [ + + ]: 811834 : if ((scan_balance == SCAN_FILE) != file)
2025 : : scan = 0;
2026 : : break;
2027 : : default:
2028 : : /* Look ma, no brain */
2029 : 0 : BUG();
2030 : : }
2031 : 812361 : nr[lru] = scan;
2032 : : }
2033 : 203235 : }
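The SCAN_FRACT arithmetic above can be modelled in isolation. The following is a minimal userspace sketch, not kernel code: it collapses the four LRU lists to one list per type, and every input value (list sizes, swappiness, rotation statistics, priority) is hypothetical.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical reclaim statistics: anon in [0], file in [1]. */
        uint64_t recent_scanned[2] = { 4000, 16000 };
        uint64_t recent_rotated[2] = { 3000, 2000 };
        uint64_t lru_size[2]       = { 100000, 400000 };
        uint64_t anon_prio = 60;               /* assumed vm.swappiness */
        uint64_t file_prio = 200 - anon_prio;
        int priority = 12;                     /* DEF_PRIORITY */

        /* Pressure is inversely proportional to recent rotation. */
        uint64_t ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
        uint64_t fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);
        uint64_t denominator = ap + fp + 1;

        for (int file = 0; file <= 1; file++) {
            uint64_t scan = lru_size[file] >> priority;

            scan = scan * (file ? fp : ap) / denominator;
            printf("%s: scan %llu pages this round\n",
                   file ? "file" : "anon", (unsigned long long)scan);
        }
        return 0;
    }

With these inputs the file list, whose pages were rarely rotated back, takes almost all of the pressure even though swappiness is non-zero.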
2034 : :
2035 : : /*
2036 : : * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2037 : : */
2038 : 0 : static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2039 : : {
2040 : : unsigned long nr[NR_LRU_LISTS];
2041 : : unsigned long targets[NR_LRU_LISTS];
2042 : : unsigned long nr_to_scan;
2043 : : enum lru_list lru;
2044 : : unsigned long nr_reclaimed = 0;
2045 : 203240 : unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2046 : : struct blk_plug plug;
2047 : : bool scan_adjusted = false;
2048 : :
2049 : 203240 : get_scan_count(lruvec, sc, nr);
2050 : :
2051 : : /* Record the original scan target for proportional adjustments later */
2052 : 203407 : memcpy(targets, nr, sizeof(nr));
2053 : :
2054 : 203407 : blk_start_plug(&plug);
2055 [ + + ][ + + ]: 268317 : while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
[ + + ]
2056 : 207792 : nr[LRU_INACTIVE_FILE]) {
2057 : : unsigned long nr_anon, nr_file, percentage;
2058 : : unsigned long nr_scanned;
2059 : :
2060 [ + + ]: 324137 : for_each_evictable_lru(lru) {
2061 [ + + ]: 259295 : if (nr[lru]) {
2062 : 0 : nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2063 : 0 : nr[lru] -= nr_to_scan;
2064 : :
2065 : 119399 : nr_reclaimed += shrink_list(lru, nr_to_scan,
2066 : : lruvec, sc);
2067 : : }
2068 : : }
2069 : :
2070 [ + - ]: 64842 : if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2071 : 64842 : continue;
2072 : :
2073 : : /*
2074 : : * For global direct reclaim, reclaim only the number of pages
2075 : : * requested. Less care is taken to scan proportionally as it
2076 : : * is more important to minimise direct reclaim stall latency
2077 : : * than it is to properly age the LRU lists.
2078 : : */
2079 [ # # ]: 0 : if (global_reclaim(sc) && !current_is_kswapd())
2080 : : break;
2081 : :
2082 : : /*
2083 : : * For kswapd and memcg, reclaim at least the number of pages
2084 : : * requested. Ensure that the anon and file LRUs shrink
2085 : : * proportionally to what was requested by get_scan_count(). We
2086 : : * stop reclaiming one LRU and reduce the amount of scanning
2087 : : * in proportion to the original scan target.
2088 : : */
2089 : 0 : nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2090 : 0 : nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2091 : :
2092 [ # # ]: 0 : if (nr_file > nr_anon) {
2093 : 0 : unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2094 : 0 : targets[LRU_ACTIVE_ANON] + 1;
2095 : : lru = LRU_BASE;
2096 : 0 : percentage = nr_anon * 100 / scan_target;
2097 : : } else {
2098 : 0 : unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2099 : 0 : targets[LRU_ACTIVE_FILE] + 1;
2100 : : lru = LRU_FILE;
2101 : 0 : percentage = nr_file * 100 / scan_target;
2102 : : }
2103 : :
2104 : : /* Stop scanning the smaller of the LRU */
2105 : 0 : nr[lru] = 0;
2106 : 0 : nr[lru + LRU_ACTIVE] = 0;
2107 : :
2108 : : /*
2109 : : * Recalculate the other LRU scan count based on its original
2110 : : * scan target and the percentage scanning already complete
2111 : : */
2112 [ # # ]: 0 : lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2113 : 0 : nr_scanned = targets[lru] - nr[lru];
2114 : 0 : nr[lru] = targets[lru] * (100 - percentage) / 100;
2115 : 0 : nr[lru] -= min(nr[lru], nr_scanned);
2116 : :
2117 : 0 : lru += LRU_ACTIVE;
2118 : 0 : nr_scanned = targets[lru] - nr[lru];
2119 : 0 : nr[lru] = targets[lru] * (100 - percentage) / 100;
2120 : 64842 : nr[lru] -= min(nr[lru], nr_scanned);
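		/*
		 * Worked example with hypothetical targets: if
		 * get_scan_count() asked for 1000 file pages and 100 anon
		 * pages, and the loop above hit nr_to_reclaim with
		 * nr_file = 800 and nr_anon = 50 still outstanding, then
		 * nr_file > nr_anon, anon scanning stops, and
		 * percentage = 50 * 100 / 101 ~= 49, i.e. ~51% of the anon
		 * target was scanned.  The file counts are rescaled above
		 * so that file scanning also stops near 51% of its target,
		 * preserving the requested anon/file balance.
		 */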
2121 : :
2122 : : scan_adjusted = true;
2123 : : }
2124 : 203488 : blk_finish_plug(&plug);
2125 : 203477 : sc->nr_reclaimed += nr_reclaimed;
2126 : :
2127 : : /*
2128 : : * Even if we did not try to evict anon pages at all, we want to
2129 : : * rebalance the anon lru active/inactive ratio.
2130 : : */
2131 [ - + ]: 203477 : if (inactive_anon_is_low(lruvec))
2132 : 0 : shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2133 : : sc, LRU_ACTIVE_ANON);
2134 : :
2135 : 203477 : throttle_vm_writeout(sc->gfp_mask);
2136 : 202181 : }
2137 : :
2138 : : /* Use reclaim/compaction for costly allocs or under memory pressure */
2139 : 202376 : static bool in_reclaim_compaction(struct scan_control *sc)
2140 : : {
2141 [ + + ][ + - ]: 202376 : if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2142 [ + ]: 202376 : (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2143 : 2 : sc->priority < DEF_PRIORITY - 2))
2144 : : return true;
2145 : :
2146 : : return false;
2147 : : }
2148 : :
2149 : : /*
2150 : : * Reclaim/compaction is used for high-order allocation requests. It reclaims
2151 : : * order-0 pages before compacting the zone. should_continue_reclaim() returns
2152 : : * true if more pages should be reclaimed such that when the page allocator
2153 : : * calls try_to_compact_zone() it will have enough free pages to succeed.
2154 : : * It will give up earlier than that if there is difficulty reclaiming pages.
2155 : : */
2156 : : static inline bool should_continue_reclaim(struct zone *zone,
2157 : : unsigned long nr_reclaimed,
2158 : : unsigned long nr_scanned,
2159 : 202229 : struct scan_control *sc)
2160 : : {
2161 : : unsigned long pages_for_compaction;
2162 : : unsigned long inactive_lru_pages;
2163 : :
2164 : : /* If not in reclaim/compaction mode, stop */
2165 [ - + ]: 202229 : if (!in_reclaim_compaction(sc))
2166 : : return false;
2167 : :
2168 : : /* Consider stopping depending on scan and reclaim activity */
2169 [ # # ]: 0 : if (sc->gfp_mask & __GFP_REPEAT) {
2170 : : /*
2171 : : * For __GFP_REPEAT allocations, stop reclaiming if the
2172 : : * full LRU list has been scanned and we are still failing
2173 : : * to reclaim pages. This full LRU scan is potentially
2174 : : * expensive but a __GFP_REPEAT caller really wants to succeed
2175 : : */
2176 [ # # ]: 0 : if (!nr_reclaimed && !nr_scanned)
2177 : : return false;
2178 : : } else {
2179 : : /*
2180 : : * For non-__GFP_REPEAT allocations which can presumably
2181 : : * fail without consequence, stop if we failed to reclaim
2182 : : * any pages from the last SWAP_CLUSTER_MAX number of
2183 : : * pages that were scanned. This will return to the
2184 : : * caller faster at the risk that reclaim/compaction and
2185 : : * the resulting allocation attempt fail.
2186 : : */
2187 [ # # ]: 0 : if (!nr_reclaimed)
2188 : : return false;
2189 : : }
2190 : :
2191 : : /*
2192 : : * If we have not reclaimed enough pages for compaction and the
2193 : : * inactive lists are large enough, continue reclaiming
2194 : : */
2195 : 107 : pages_for_compaction = (2UL << sc->order);
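	/*
	 * Worked example: for a THP-sized request (order 9 on x86 with 4K
	 * pages) this is 2 << 9 = 1024 pages, i.e. keep reclaiming until
	 * two huge pages' worth of order-0 pages are free, provided the
	 * inactive lists still hold more than that.
	 */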
2196 : : inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
2197 [ - + ]: 107 : if (get_nr_swap_pages() > 0)
2198 : 0 : inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
2199 [ - ][ # # ]: 107 : if (sc->nr_reclaimed < pages_for_compaction &&
2200 : : inactive_lru_pages > pages_for_compaction)
2201 : : return true;
2202 : :
2203 : : /* If compaction would go ahead or the allocation would succeed, stop */
2204 [ # # ]: 0 : switch (compaction_suitable(zone, sc->order)) {
2205 : : case COMPACT_PARTIAL:
2206 : : case COMPACT_CONTINUE:
2207 : : return false;
2208 : : default:
2209 : : return true;
2210 : : }
2211 : : }
2212 : :
2213 : 203303 : static void shrink_zone(struct zone *zone, struct scan_control *sc)
2214 : : {
2215 : : unsigned long nr_reclaimed, nr_scanned;
2216 : :
2217 : : do {
2218 : : struct mem_cgroup *root = sc->target_mem_cgroup;
2219 : : struct mem_cgroup_reclaim_cookie reclaim = {
2220 : : .zone = zone,
2221 : : .priority = sc->priority,
2222 : : };
2223 : : struct mem_cgroup *memcg;
2224 : :
2225 : 203303 : nr_reclaimed = sc->nr_reclaimed;
2226 : 203303 : nr_scanned = sc->nr_scanned;
2227 : :
2228 : : memcg = mem_cgroup_iter(root, NULL, &reclaim);
2229 : : do {
2230 : : struct lruvec *lruvec;
2231 : :
2232 : 203303 : lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2233 : :
2234 : 203303 : shrink_lruvec(lruvec, sc);
2235 : :
2236 : : /*
2237 : : * Direct reclaim and kswapd have to scan all memory
2238 : : * cgroups to fulfill the overall scan target for the
2239 : : * zone.
2240 : : *
2241 : : * Limit reclaim, on the other hand, only cares about
2242 : : * nr_to_reclaim pages to be reclaimed and it will
2243 : : * retry with decreasing priority if one round over the
2244 : : * whole hierarchy is not sufficient.
2245 : : */
2246 : : if (!global_reclaim(sc) &&
2247 : : sc->nr_reclaimed >= sc->nr_to_reclaim) {
2248 : : mem_cgroup_iter_break(root, memcg);
2249 : : break;
2250 : : }
2251 : : memcg = mem_cgroup_iter(root, memcg, &reclaim);
2252 : : } while (memcg);
2253 : :
2254 : : vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2255 : : sc->nr_scanned - nr_scanned,
2256 : : sc->nr_reclaimed - nr_reclaimed);
2257 : :
2258 : 404458 : } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2259 [ - + ]: 203566 : sc->nr_scanned - nr_scanned, sc));
2260 : 203566 : }
2261 : :
2262 : : /* Returns true if compaction should go ahead for a high-order request */
2263 : : static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2264 : : {
2265 : : unsigned long balance_gap, watermark;
2266 : : bool watermark_ok;
2267 : :
2268 : : /* Do not consider compaction for orders reclaim is meant to satisfy */
2269 [ - + ]: 165322 : if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2270 : : return false;
2271 : :
2272 : : /*
2273 : : * Compaction takes time to run and there are potentially other
2274 : : * callers using the pages just freed. Continue reclaiming until
2275 : : * there is a buffer of free pages available to give compaction
2276 : : * a reasonable chance of completing and allocating the page
2277 : : */
2278 : 0 : balance_gap = min(low_wmark_pages(zone),
2279 : : (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2280 : : KSWAPD_ZONE_BALANCE_GAP_RATIO);
2281 : 0 : watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2282 : 0 : watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
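	/*
	 * Worked example with hypothetical numbers: a zone with 1,000,000
	 * managed pages and a low watermark of 20,000 pages gets
	 * balance_gap = min(20000, 10000) = 10000 (1% of the zone), so an
	 * order-9 request keeps reclaiming until the zone has
	 * high_wmark + 10000 + 1024 free pages.
	 */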
2283 : :
2284 : : /*
2285 : : * If compaction is deferred, reclaim up to a point where
2286 : : * compaction will have a chance of success when re-enabled
2287 : : */
2288 [ # # ]: 0 : if (compaction_deferred(zone, sc->order))
2289 : : return watermark_ok;
2290 : :
2291 : : /* If compaction is not ready to start, keep reclaiming */
2292 [ # # ]: 0 : if (!compaction_suitable(zone, sc->order))
2293 : : return false;
2294 : :
2295 : : return watermark_ok;
2296 : : }
2297 : :
2298 : : /*
2299 : : * This is the direct reclaim path, for page-allocating processes. We only
2300 : : * try to reclaim pages from zones which will satisfy the caller's allocation
2301 : : * request.
2302 : : *
2303 : : * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
2304 : : * Because:
2305 : : * a) The caller may be trying to free *extra* pages to satisfy a higher-order
2306 : : * allocation or
2307 : : * b) The target zone may be at high_wmark_pages(zone) but the lower zones
2308 : : * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
2309 : : * zone defense algorithm.
2310 : : *
2311 : : * If a zone is deemed to be full of pinned pages then just give it a light
2312 : : * scan and then give up on it.
2313 : : *
2314 : : * This function returns true if a zone is being reclaimed for a costly
2315 : : * high-order allocation and compaction is ready to begin. This indicates to
2316 : : * the caller that it should consider retrying the allocation instead of
2317 : : * further reclaim.
2318 : : */
2319 : 0 : static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2320 : : {
2321 : : struct zoneref *z;
2322 : : struct zone *zone;
2323 : : unsigned long nr_soft_reclaimed;
2324 : : unsigned long nr_soft_scanned;
2325 : : bool aborted_reclaim = false;
2326 : :
2327 : : /*
2328 : : * If the number of buffer_heads in the machine exceeds the maximum
2329 : : * allowed level, force direct reclaim to scan the highmem zone as
2330 : : * highmem pages could be pinning lowmem pages storing buffer_heads
2331 : : */
2332 [ - + ]: 132528 : if (buffer_heads_over_limit)
2333 : 0 : sc->gfp_mask |= __GFP_HIGHMEM;
2334 : :
2335 [ + + ]: 264241 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
2336 : : gfp_zone(sc->gfp_mask), sc->nodemask) {
2337 [ + ]: 264574 : if (!populated_zone(zone))
2338 : 0 : continue;
2339 : : /*
2340 : : * Take care that memory controller reclaim has only a small
2341 : : * influence on the global LRU.
2342 : : */
2343 : : if (global_reclaim(sc)) {
2344 : : if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2345 : : continue;
2346 [ + + + + ]: 641935 : if (sc->priority != DEF_PRIORITY &&
2347 : : !zone_reclaimable(zone))
2348 : 99657 : continue; /* Let kswapd poll it */
2349 : : if (IS_ENABLED(CONFIG_COMPACTION)) {
2350 : : /*
2351 : : * If we already have plenty of memory free for
2352 : : * compaction in this zone, don't free any more.
2353 : : * Even though compaction is invoked for any
2354 : : * non-zero order, only frequent costly order
2355 : : * reclamation is disruptive enough to become a
2356 : : * noticeable problem, like transparent huge
2357 : : * page allocations.
2358 : : */
2359 [ - + ]: 165322 : if (compaction_ready(zone, sc)) {
2360 : : aborted_reclaim = true;
2361 : 0 : continue;
2362 : : }
2363 : : }
2364 : : /*
2365 : : * This steals pages from memory cgroups over softlimit
2366 : : * and returns the number of reclaimed pages and
2367 : : * scanned pages. This works for global memory pressure
2368 : : * and balancing, not for a memcg's limit.
2369 : : */
2370 : : nr_soft_scanned = 0;
2371 : : nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2372 : : sc->order, sc->gfp_mask,
2373 : : &nr_soft_scanned);
2374 : : sc->nr_reclaimed += nr_soft_reclaimed;
2375 : : sc->nr_scanned += nr_soft_scanned;
2376 : : /* need some check to avoid calling shrink_zone() again */
2377 : : }
2378 : :
2379 : 165322 : shrink_zone(zone, sc);
2380 : : }
2381 : :
2382 : 132565 : return aborted_reclaim;
2383 : : }
2384 : :
2385 : : /* All zones in zonelist are unreclaimable? */
2386 : 10148 : static bool all_unreclaimable(struct zonelist *zonelist,
2387 : : struct scan_control *sc)
2388 : : {
2389 : : struct zoneref *z;
2390 : : struct zone *zone;
2391 : :
2392 [ + + ]: 17530 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
2393 : : gfp_zone(sc->gfp_mask), sc->nodemask) {
2394 [ - + ]: 17133 : if (!populated_zone(zone))
2395 : 0 : continue;
2396 : : if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2397 : : continue;
2398 [ + + ]: 17133 : if (zone_reclaimable(zone))
2399 : : return false;
2400 : : }
2401 : :
2402 : : return true;
2403 : : }
2404 : :
2405 : : /*
2406 : : * This is the main entry point to direct page reclaim.
2407 : : *
2408 : : * If a full scan of the inactive list fails to free enough memory then we
2409 : : * are "out of memory" and something needs to be killed.
2410 : : *
2411 : : * If the caller is !__GFP_FS then the probability of a failure is reasonably
2412 : : * high - the zone may be full of dirty or under-writeback pages, which this
2413 : : * caller can't do much about. We kick the writeback threads and take explicit
2414 : : * naps in the hope that some of these pages can be written. But if the
2415 : : * allocating task holds filesystem locks which prevent writeout this might not
2416 : : * work, and the allocation attempt will fail.
2417 : : *
2418 : : * returns: 0, if no pages reclaimed
2419 : : * else, the number of pages reclaimed
2420 : : */
2421 : 0 : static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2422 : : struct scan_control *sc,
2423 : : struct shrink_control *shrink)
2424 : : {
2425 : : unsigned long total_scanned = 0;
2426 : 10198 : struct reclaim_state *reclaim_state = current->reclaim_state;
2427 : : struct zoneref *z;
2428 : : struct zone *zone;
2429 : : unsigned long writeback_threshold;
2430 : : bool aborted_reclaim;
2431 : :
2432 : : delayacct_freepages_start();
2433 : :
2434 : : if (global_reclaim(sc))
2435 : : count_vm_event(ALLOCSTALL);
2436 : :
2437 : : do {
2438 : : vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2439 : : sc->priority);
2440 : 132525 : sc->nr_scanned = 0;
2441 : 132525 : aborted_reclaim = shrink_zones(zonelist, sc);
2442 : :
2443 : : /*
2444 : : * Don't shrink slabs when reclaiming memory from over limit
2445 : : * cgroups but do shrink slab at least once when aborting
2446 : : * reclaim for compaction to avoid unevenly scanning file/anon
2447 : : * LRU pages over slab pages.
2448 : : */
2449 : : if (global_reclaim(sc)) {
2450 : : unsigned long lru_pages = 0;
2451 : :
2452 : : nodes_clear(shrink->nodes_to_scan);
2453 [ + + ]: 397699 : for_each_zone_zonelist(zone, z, zonelist,
2454 : : gfp_zone(sc->gfp_mask)) {
2455 : : if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2456 : : continue;
2457 : :
2458 : 265146 : lru_pages += zone_reclaimable_pages(zone);
2459 : : node_set(zone_to_nid(zone),
2460 : : shrink->nodes_to_scan);
2461 : : }
2462 : :
2463 : 132574 : shrink_slab(shrink, sc->nr_scanned, lru_pages);
2464 [ + ]: 132574 : if (reclaim_state) {
2465 : 132581 : sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2466 : 132581 : reclaim_state->reclaimed_slab = 0;
2467 : : }
2468 : : }
2469 : 132574 : total_scanned += sc->nr_scanned;
2470 [ + ]: 132574 : if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2471 : : goto out;
2472 : :
2473 : : /*
2474 : : * If we're getting trouble reclaiming, start doing
2475 : : * writepage even in laptop mode.
2476 : : */
2477 [ + + ]: 132584 : if (sc->priority < DEF_PRIORITY - 2)
2478 : 101987 : sc->may_writepage = 1;
2479 : :
2480 : : /*
2481 : : * Try to write back as many pages as we just scanned. This
2482 : : * tends to cause slow streaming writers to write data to the
2483 : : * disk smoothly, at the dirtying rate, which is nice. But
2484 : : * that's undesirable in laptop mode, where we *want* lumpy
2485 : : * writeout. So in laptop mode, write out the whole world.
2486 : : */
2487 : 132584 : writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2488 [ + + ]: 132584 : if (total_scanned > writeback_threshold) {
2489 [ + - ]: 13 : wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2490 : : WB_REASON_TRY_TO_FREE_PAGES);
2491 : 13 : sc->may_writepage = 1;
2492 : : }
2493 [ + + ][ + ]: 132584 : } while (--sc->priority >= 0 && !aborted_reclaim);
2494 : :
2495 : : out:
2496 : : delayacct_freepages_end();
2497 : :
2498 [ + ]: 10247 : if (sc->nr_reclaimed)
2499 : : return sc->nr_reclaimed;
2500 : :
2501 : : /*
2502 : : * While hibernation is in progress, kswapd is frozen, so it cannot
2503 : : * mark a zone all_unreclaimable. Bypass the all_unreclaimable check
2504 : : * in that case.
2505 : : */
2506 [ + - ]: 10148 : if (oom_killer_disabled)
2507 : : return 0;
2508 : :
2509 : : /* Aborted reclaim to try compaction? don't OOM, then */
2510 [ + - ]: 10148 : if (aborted_reclaim)
2511 : : return 1;
2512 : :
2513 : : /* top priority shrink_zones still had more to do? don't OOM, then */
2514 [ + + ]: 10148 : if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2515 : : return 1;
2516 : :
2517 : : return 0;
2518 : : }
2519 : :
2520 : 0 : static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2521 : : {
2522 : : struct zone *zone;
2523 : : unsigned long pfmemalloc_reserve = 0;
2524 : : unsigned long free_pages = 0;
2525 : : int i;
2526 : : bool wmark_ok;
2527 : :
2528 [ + + ]: 20397 : for (i = 0; i <= ZONE_NORMAL; i++) {
2529 : 10199 : zone = &pgdat->node_zones[i];
2530 : 10199 : pfmemalloc_reserve += min_wmark_pages(zone);
2531 : 10199 : free_pages += zone_page_state(zone, NR_FREE_PAGES);
2532 : : }
2533 : :
2534 : 10198 : wmark_ok = free_pages > pfmemalloc_reserve / 2;
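	/*
	 * Worked example with hypothetical numbers: if the DMA and NORMAL
	 * zones have min watermarks of 100 and 3900 pages, the reserve is
	 * 4000 pages, and direct reclaimers are throttled once the free
	 * pages in those zones drop to half the reserve (2000) or below.
	 */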
2535 : :
2536 : : /* kswapd must be awake if processes are being throttled */
2537 [ - + ][ # # ]: 10198 : if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2538 : 0 : pgdat->classzone_idx = min(pgdat->classzone_idx,
2539 : : (enum zone_type)ZONE_NORMAL);
2540 : 0 : wake_up_interruptible(&pgdat->kswapd_wait);
2541 : : }
2542 : :
2543 : 0 : return wmark_ok;
2544 : : }
2545 : :
2546 : : /*
2547 : : * Throttle direct reclaimers if backing storage is backed by the network
2548 : : * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2549 : : * depleted. kswapd will continue to make progress and wake the processes
2550 : : * when the low watermark is reached.
2551 : : *
2552 : : * Returns true if a fatal signal was delivered during throttling. If this
2553 : : * happens, the page allocator should not consider triggering the OOM killer.
2554 : : */
2555 : 0 : static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2556 : : nodemask_t *nodemask)
2557 : : {
2558 : : struct zone *zone;
2559 : : int high_zoneidx = gfp_zone(gfp_mask);
2560 : : pg_data_t *pgdat;
2561 : :
2562 : : /*
2563 : : * Kernel threads should not be throttled as they may be indirectly
2564 : : * responsible for cleaning pages necessary for reclaim to make forward
2565 : : * progress. kjournald for example may enter direct reclaim while
2566 : : * committing a transaction, where throttling it could force other
2567 : : * processes to block on log_wait_commit().
2568 : : */
2569 [ + + ]: 10199 : if (current->flags & PF_KTHREAD)
2570 : : goto out;
2571 : :
2572 : : /*
2573 : : * If a fatal signal is pending, this process should not throttle.
2574 : : * It should return quickly so it can exit and free its memory
2575 : : */
2576 [ + + ]: 10198 : if (fatal_signal_pending(current))
2577 : : goto out;
2578 : :
2579 : : /* Check if the pfmemalloc reserves are ok */
2580 : : first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2581 : 10199 : pgdat = zone->zone_pgdat;
2582 [ - + ]: 10199 : if (pfmemalloc_watermark_ok(pgdat))
2583 : : goto out;
2584 : :
2585 : : /* Account for the throttling */
2586 : : count_vm_event(PGSCAN_DIRECT_THROTTLE);
2587 : :
2588 : : /*
2589 : : * If the caller cannot enter the filesystem, it's possible that it
2590 : : * is due to the caller holding an FS lock or performing a journal
2591 : : * transaction in the case of a filesystem like ext[3|4]. In this case,
2592 : : * it is not safe to block on pfmemalloc_wait as kswapd could be
2593 : : * blocked waiting on the same lock. Instead, throttle for up to a
2594 : : * second before continuing.
2595 : : */
2596 [ # # ]: 0 : if (!(gfp_mask & __GFP_FS)) {
2597 [ # # ][ # # ]: 0 : wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
[ # # ][ # # ]
2598 : : pfmemalloc_watermark_ok(pgdat), HZ);
2599 : :
2600 : : goto check_pending;
2601 : : }
2602 : :
2603 : : /* Throttle until kswapd wakes the process */
2604 [ # # ][ # # ]: 0 : wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
[ # # ]
2605 : : pfmemalloc_watermark_ok(pgdat));
2606 : :
2607 : : check_pending:
2608 [ # # ]: 0 : if (fatal_signal_pending(current))
2609 : : return true;
2610 : :
2611 : : out:
2612 : : return false;
2613 : : }
2614 : :
2615 : 0 : unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2616 : : gfp_t gfp_mask, nodemask_t *nodemask)
2617 : : {
2618 : : unsigned long nr_reclaimed;
2619 : 30591 : struct scan_control sc = {
2620 : : .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2621 : 10197 : .may_writepage = !laptop_mode,
2622 : : .nr_to_reclaim = SWAP_CLUSTER_MAX,
2623 : : .may_unmap = 1,
2624 : : .may_swap = 1,
2625 : : .order = order,
2626 : : .priority = DEF_PRIORITY,
2627 : : .target_mem_cgroup = NULL,
2628 : : .nodemask = nodemask,
2629 : : };
2630 : 10197 : struct shrink_control shrink = {
2631 : : .gfp_mask = sc.gfp_mask,
2632 : : };
2633 : :
2634 : : /*
2635 : : * Do not enter reclaim if a fatal signal was delivered while throttled.
2636 : : * 1 is returned so that the page allocator does not OOM kill at this
2637 : : * point.
2638 : : */
2639 [ + + ]: 10197 : if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2640 : : return 1;
2641 : :
2642 : 10197 : trace_mm_vmscan_direct_reclaim_begin(order,
2643 : : sc.may_writepage,
2644 : : gfp_mask);
2645 : :
2646 : 10197 : nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2647 : :
2648 : : trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2649 : :
2650 : 10198 : return nr_reclaimed;
2651 : : }
2652 : :
2653 : : #ifdef CONFIG_MEMCG
2654 : :
2655 : : unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2656 : : gfp_t gfp_mask, bool noswap,
2657 : : struct zone *zone,
2658 : : unsigned long *nr_scanned)
2659 : : {
2660 : : struct scan_control sc = {
2661 : : .nr_scanned = 0,
2662 : : .nr_to_reclaim = SWAP_CLUSTER_MAX,
2663 : : .may_writepage = !laptop_mode,
2664 : : .may_unmap = 1,
2665 : : .may_swap = !noswap,
2666 : : .order = 0,
2667 : : .priority = 0,
2668 : : .target_mem_cgroup = memcg,
2669 : : };
2670 : : struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2671 : :
2672 : : sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2673 : : (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2674 : :
2675 : : trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2676 : : sc.may_writepage,
2677 : : sc.gfp_mask);
2678 : :
2679 : : /*
2680 : : * NOTE: Although we can get the priority field, using it
2681 : : * here is not a good idea, since it limits the pages we can scan.
2682 : : * If we don't reclaim here, the shrink_zone from balance_pgdat
2683 : : * will pick up pages from other mem cgroups as well. We hack
2684 : : * the priority and make it zero.
2685 : : */
2686 : : shrink_lruvec(lruvec, &sc);
2687 : :
2688 : : trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2689 : :
2690 : : *nr_scanned = sc.nr_scanned;
2691 : : return sc.nr_reclaimed;
2692 : : }
2693 : :
2694 : : unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2695 : : gfp_t gfp_mask,
2696 : : bool noswap)
2697 : : {
2698 : : struct zonelist *zonelist;
2699 : : unsigned long nr_reclaimed;
2700 : : int nid;
2701 : : struct scan_control sc = {
2702 : : .may_writepage = !laptop_mode,
2703 : : .may_unmap = 1,
2704 : : .may_swap = !noswap,
2705 : : .nr_to_reclaim = SWAP_CLUSTER_MAX,
2706 : : .order = 0,
2707 : : .priority = DEF_PRIORITY,
2708 : : .target_mem_cgroup = memcg,
2709 : : .nodemask = NULL, /* we don't care about placement */
2710 : : .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2711 : : (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2712 : : };
2713 : : struct shrink_control shrink = {
2714 : : .gfp_mask = sc.gfp_mask,
2715 : : };
2716 : :
2717 : : /*
2718 : : * Unlike direct reclaim via alloc_pages(), memcg reclaim doesn't
2719 : : * care where its pages come from, so the node where we start the
2720 : : * scan does not need to be the current node.
2721 : : */
2722 : : nid = mem_cgroup_select_victim_node(memcg);
2723 : :
2724 : : zonelist = NODE_DATA(nid)->node_zonelists;
2725 : :
2726 : : trace_mm_vmscan_memcg_reclaim_begin(0,
2727 : : sc.may_writepage,
2728 : : sc.gfp_mask);
2729 : :
2730 : : nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2731 : :
2732 : : trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2733 : :
2734 : : return nr_reclaimed;
2735 : : }
2736 : : #endif
2737 : :
2738 : 0 : static void age_active_anon(struct zone *zone, struct scan_control *sc)
2739 : : {
2740 : : struct mem_cgroup *memcg;
2741 : :
2742 [ - + ]: 26045 : if (!total_swap_pages)
2743 : 0 : return;
2744 : :
2745 : : memcg = mem_cgroup_iter(NULL, NULL, NULL);
2746 : : do {
2747 : 0 : struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2748 : :
2749 [ - ]: 0 : if (inactive_anon_is_low(lruvec))
2750 : 0 : shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2751 : : sc, LRU_ACTIVE_ANON);
2752 : :
2753 : : memcg = mem_cgroup_iter(NULL, memcg, NULL);
2754 : : } while (memcg);
2755 : : }
2756 : :
2757 : 0 : static bool zone_balanced(struct zone *zone, int order,
2758 : : unsigned long balance_gap, int classzone_idx)
2759 : : {
2760 [ + + ]: 174493 : if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2761 : : balance_gap, classzone_idx, 0))
2762 : : return false;
2763 : :
2764 [ + + + - ]: 47547 : if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2765 : 921 : !compaction_suitable(zone, order))
2766 : : return false;
2767 : :
2768 : : return true;
2769 : : }
2770 : :
2771 : : /*
2772 : : * pgdat_balanced() is used when checking if a node is balanced.
2773 : : *
2774 : : * For order-0, all zones must be balanced!
2775 : : *
2776 : : * For high-order allocations only zones that meet watermarks and are in a
2777 : : * zone allowed by the caller's classzone_idx are added to balanced_pages. The
2778 : : * total of balanced pages must be at least 25% of the zones allowed by
2779 : : * classzone_idx for the node to be considered balanced. Forcing all zones to
2780 : : * be balanced for high orders can cause excessive reclaim when there are
2781 : : * imbalanced zones.
2782 : : * The choice of 25% is due to
2783 : : * o a 16M DMA zone that is balanced will not balance a zone on any
2784 : : * reasonably sized machine
2785 : : * o On all other machines, the top zone must be at least a reasonable
2786 : : * percentage of the middle zones. For example, on 32-bit x86, highmem
2787 : : * would need to be at least 256M for it to balance a whole node.
2788 : : * Similarly, on x86-64 the Normal zone would need to be at least 1G
2789 : : * to balance a node on its own. These seemed like reasonable ratios.
2790 : : */
2791 : 0 : static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2792 : : {
2793 : : unsigned long managed_pages = 0;
2794 : : unsigned long balanced_pages = 0;
2795 : : int i;
2796 : :
2797 : : /* Check the watermark levels */
2798 [ + + ]: 58143 : for (i = 0; i <= classzone_idx; i++) {
2799 : 57171 : struct zone *zone = pgdat->node_zones + i;
2800 : :
2801 [ - + ]: 57171 : if (!populated_zone(zone))
2802 : 0 : continue;
2803 : :
2804 : 57171 : managed_pages += zone->managed_pages;
2805 : :
2806 : : /*
2807 : : * A special case here:
2808 : : *
2809 : : * balance_pgdat() skips over all_unreclaimable after
2810 : : * DEF_PRIORITY. Effectively, it considers them balanced so
2811 : : * they must be considered balanced here as well!
2812 : : */
2813 [ + + ]: 57171 : if (!zone_reclaimable(zone)) {
2814 : 15590 : balanced_pages += zone->managed_pages;
2815 : 15590 : continue;
2816 : : }
2817 : :
2818 [ + + ]: 41581 : if (zone_balanced(zone, order, 0, i))
2819 : 13966 : balanced_pages += zone->managed_pages;
2820 [ + + ]: 27615 : else if (!order)
2821 : : return false;
2822 : : }
2823 : :
2824 [ + + ]: 972 : if (order)
2825 : 2 : return balanced_pages >= (managed_pages >> 2);
2826 : : else
2827 : : return true;
2828 : : }
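The high-order 25% rule above can be sketched in isolation. A minimal userspace model follows; the zone layout and page counts are hypothetical, and the per-zone watermark/compaction test is reduced to a precomputed boolean.

    #include <stdbool.h>
    #include <stdio.h>

    static bool node_balanced_high_order(const unsigned long *managed,
                                         const bool *zone_ok, int nzones)
    {
        unsigned long managed_pages = 0, balanced_pages = 0;

        for (int i = 0; i < nzones; i++) {
            managed_pages += managed[i];
            if (zone_ok[i])        /* stands in for zone_balanced() */
                balanced_pages += managed[i];
        }
        /* Balanced when at least 25% of the managed pages are balanced. */
        return balanced_pages >= (managed_pages >> 2);
    }

    int main(void)
    {
        /* Hypothetical DMA, DMA32 and NORMAL zones of one node. */
        unsigned long managed[] = { 16384, 983040, 3000000 };
        bool ok[] = { true, true, false };

        printf("node balanced: %d\n",
               node_balanced_high_order(managed, ok, 3));
        return 0;
    }

With these numbers the two small balanced zones cover just under a quarter of the node, so the imbalanced NORMAL zone keeps kswapd working.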
2829 : :
2830 : : /*
2831 : : * Prepare kswapd for sleeping. This verifies that there are no processes
2832 : : * waiting in throttle_direct_reclaim() and that watermarks have been met.
2833 : : *
2834 : : * Returns true if kswapd is ready to sleep
2835 : : */
2836 : 0 : static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2837 : : int classzone_idx)
2838 : : {
2839 : : /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2840 [ + + ]: 5060 : if (remaining)
2841 : : return false;
2842 : :
2843 : : /*
2844 : : * There is a potential race between when kswapd checks its watermarks
2845 : : * and a process gets throttled. There is also a potential race if
2846 : : * processes get throttled, kswapd wakes, and a large process exits,
2847 : : * thereby balancing the zones, which causes kswapd to miss a wakeup.
2848 : : * If kswapd is going to sleep, no process should be sleeping on
2849 : : * pfmemalloc_wait, so wake them now if necessary. If need be, they will wake
2850 : : * kswapd and get throttled again
2851 : : */
2852 [ - + ]: 4592 : if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2853 : 0 : wake_up(&pgdat->pfmemalloc_wait);
2854 : 0 : return false;
2855 : : }
2856 : :
2857 : 4592 : return pgdat_balanced(pgdat, order, classzone_idx);
2858 : : }
2859 : :
2860 : : /*
2861 : : * kswapd shrinks the zone by the number of pages required to reach
2862 : : * the high watermark.
2863 : : *
2864 : : * Returns true if kswapd scanned at least the requested number of pages to
2865 : : * reclaim or if the lack of progress was due to pages under writeback.
2866 : : * This is used to determine if the scanning priority needs to be raised.
2867 : : */
2868 : 0 : static bool kswapd_shrink_zone(struct zone *zone,
2869 : : int classzone_idx,
2870 : : struct scan_control *sc,
2871 : : unsigned long lru_pages,
2872 : : unsigned long *nr_attempted)
2873 : : {
2874 : 40328 : int testorder = sc->order;
2875 : : unsigned long balance_gap;
2876 : 40328 : struct reclaim_state *reclaim_state = current->reclaim_state;
2877 : 80656 : struct shrink_control shrink = {
2878 : 40328 : .gfp_mask = sc->gfp_mask,
2879 : : };
2880 : : bool lowmem_pressure;
2881 : :
2882 : : /* Reclaim above the high watermark. */
2883 : 40328 : sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2884 : :
2885 : : /*
2886 : : * Kswapd reclaims only single pages with compaction enabled. Trying
2887 : : * too hard to reclaim until contiguous free pages have become
2888 : : * available can hurt performance by evicting too much useful data
2889 : : * from memory. Do not reclaim more than needed for compaction.
2890 : : */
2891 [ + + + + ]: 40330 : if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2892 : 2 : compaction_suitable(zone, sc->order) !=
2893 : : COMPACT_SKIPPED)
2894 : : testorder = 0;
2895 : :
2896 : : /*
2897 : : * We put equal pressure on every zone, unless one zone has way too
2898 : : * many pages free already. The "too many pages" is defined as the
2899 : : * high wmark plus a "gap" where the gap is either the low
2900 : : * watermark or 1% of the zone, whichever is smaller.
2901 : : */
2902 : 40328 : balance_gap = min(low_wmark_pages(zone),
2903 : : (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2904 : : KSWAPD_ZONE_BALANCE_GAP_RATIO);
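	/*
	 * Worked example with hypothetical numbers: a zone with 500,000
	 * managed pages and a low watermark of 8,000 pages gets
	 * balance_gap = min(8000, 5000) = 5000, so kswapd skips it only
	 * once it is high_wmark + 5000 pages above water.
	 */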
2905 : :
2906 : : /*
2907 : : * If there is no low memory pressure or the zone is balanced then no
2908 : : * reclaim is necessary
2909 : : */
2910 [ - + ][ # # ]: 40328 : lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2911 [ + - ][ + + ]: 40328 : if (!lowmem_pressure && zone_balanced(zone, testorder,
2912 : : balance_gap, classzone_idx))
2913 : : return true;
2914 : :
2915 : 38376 : shrink_zone(zone, sc);
2916 : : nodes_clear(shrink.nodes_to_scan);
2917 : : node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2918 : :
2919 : 38376 : reclaim_state->reclaimed_slab = 0;
2920 : 38376 : shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2921 : 38376 : sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2922 : :
2923 : : /* Account for the number of pages attempted to reclaim */
2924 : 38376 : *nr_attempted += sc->nr_to_reclaim;
2925 : :
2926 : : zone_clear_flag(zone, ZONE_WRITEBACK);
2927 : :
2928 : : /*
2929 : : * If a zone reaches its high watermark, consider it to be no longer
2930 : : * congested. It's possible there are dirty pages backed by congested
2931 : : * BDIs but as pressure is relieved, speculatively avoid congestion
2932 : : * waits.
2933 : : */
2934 [ + + + + ]: 74837 : if (zone_reclaimable(zone) &&
2935 : 36461 : zone_balanced(zone, testorder, 0, classzone_idx)) {
2936 : : zone_clear_flag(zone, ZONE_CONGESTED);
2937 : : zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2938 : : }
2939 : :
2940 : 38376 : return sc->nr_scanned >= sc->nr_to_reclaim;
2941 : : }
2942 : :
2943 : : /*
2944 : : * For kswapd, balance_pgdat() will work across all this node's zones until
2945 : : * they are all at high_wmark_pages(zone).
2946 : : *
2947 : : * Returns the final order kswapd was reclaiming at
2948 : : *
2949 : : * There is special handling here for zones which are full of pinned pages.
2950 : : * This can happen if the pages are all mlocked, or if they are all used by
2951 : : * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
2952 : : * What we do is to detect the case where all pages in the zone have been
2953 : : * scanned twice and there has been zero successful reclaim. Mark the zone as
2954 : : * dead and from now on, only perform a short scan. Basically we're polling
2955 : : * the zone for when the problem goes away.
2956 : : *
2957 : : * kswapd scans the zones in the highmem->normal->dma direction. It skips
2958 : : * zones which have free_pages > high_wmark_pages(zone), but once a zone is
2959 : : * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
2960 : : * lower zones regardless of the number of free pages in the lower zones. This
2961 : : * interoperates with the page allocator fallback scheme to ensure that aging
2962 : : * of pages is balanced across the zones.
2963 : : */
2964 : 0 : static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2965 : : int *classzone_idx)
2966 : : {
2967 : : int i;
2968 : : int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2969 : : unsigned long nr_soft_reclaimed;
2970 : : unsigned long nr_soft_scanned;
2971 : 5060 : struct scan_control sc = {
2972 : : .gfp_mask = GFP_KERNEL,
2973 : : .priority = DEF_PRIORITY,
2974 : : .may_unmap = 1,
2975 : : .may_swap = 1,
2976 : 2530 : .may_writepage = !laptop_mode,
2977 : : .order = order,
2978 : : .target_mem_cgroup = NULL,
2979 : : };
2980 : : count_vm_event(PAGEOUTRUN);
2981 : :
2982 : : do {
2983 : : unsigned long lru_pages = 0;
2984 : 26045 : unsigned long nr_attempted = 0;
2985 : : bool raise_priority = true;
2986 : 26045 : bool pgdat_needs_compaction = (order > 0);
2987 : :
2988 : 26045 : sc.nr_reclaimed = 0;
2989 : :
2990 : : /*
2991 : : * Scan in the highmem->dma direction for the highest
2992 : : * zone which needs scanning
2993 : : */
2994 [ + - ]: 26045 : for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2995 : 26045 : struct zone *zone = pgdat->node_zones + i;
2996 : :
2997 [ - + ]: 26045 : if (!populated_zone(zone))
2998 : 0 : continue;
2999 : :
3000 [ + + - + ]: 49087 : if (sc.priority != DEF_PRIORITY &&
3001 : : !zone_reclaimable(zone))
3002 : 0 : continue;
3003 : :
3004 : : /*
3005 : : * Do some background aging of the anon list, to give
3006 : : * pages a chance to be referenced before reclaiming.
3007 : : */
3008 : 26045 : age_active_anon(zone, &sc);
3009 : :
3010 : : /*
3011 : : * If the number of buffer_heads in the machine
3012 : : * exceeds the maximum allowed level and this node
3013 : : * has a highmem zone, force kswapd to reclaim from
3014 : : * it to relieve lowmem pressure.
3015 : : */
3016 [ - + ][ # # ]: 26045 : if (buffer_heads_over_limit && is_highmem_idx(i)) {
3017 : : end_zone = i;
3018 : : break;
3019 : : }
3020 : :
3021 [ - + ]: 26045 : if (!zone_balanced(zone, order, 0, 0)) {
3022 : : end_zone = i;
3023 : : break;
3024 : : } else {
3025 : : /*
3026 : : * If balanced, clear the dirty and congested
3027 : : * flags
3028 : : */
3029 : : zone_clear_flag(zone, ZONE_CONGESTED);
3030 : : zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
3031 : : }
3032 : : }
3033 : :
3034 [ + - ]: 26045 : if (i < 0)
3035 : : goto out;
3036 : :
3037 [ + + ]: 78135 : for (i = 0; i <= end_zone; i++) {
3038 : 52090 : struct zone *zone = pgdat->node_zones + i;
3039 : :
3040 [ - + ]: 52090 : if (!populated_zone(zone))
3041 : 0 : continue;
3042 : :
3043 : 52090 : lru_pages += zone_reclaimable_pages(zone);
3044 : :
3045 : : /*
3046 : : * If any zone is currently balanced then kswapd will
3047 : : * not call compaction as it is expected that the
3048 : : * necessary pages are already available.
3049 : : */
3050 [ + + + - ]: 52091 : if (pgdat_needs_compaction &&
3051 : 1 : zone_watermark_ok(zone, order,
3052 : : low_wmark_pages(zone),
3053 : : *classzone_idx, 0))
3054 : : pgdat_needs_compaction = false;
3055 : : }
3056 : :
3057 : : /*
3058 : :                  * If we're having trouble reclaiming, start doing writepage
3059 : : * even in laptop mode.
3060 : : */
3061 [ + + ]: 26045 : if (sc.priority < DEF_PRIORITY - 2)
3062 : 26045 : sc.may_writepage = 1;
3063 : :
3064 : : /*
3065 : : * Now scan the zone in the dma->highmem direction, stopping
3066 : : * at the last zone which needs scanning.
3067 : : *
3068 : : * We do this because the page allocator works in the opposite
3069 : : * direction. This prevents the page allocator from allocating
3070 : : * pages behind kswapd's direction of progress, which would
3071 : : * cause too much scanning of the lower zones.
3072 : : */
3073 [ + + ]: 78135 : for (i = 0; i <= end_zone; i++) {
3074 : 52090 : struct zone *zone = pgdat->node_zones + i;
3075 : :
3076 [ - + ]: 52090 : if (!populated_zone(zone))
3077 : 0 : continue;
3078 : :
3079 [ + + + + ]: 98174 : if (sc.priority != DEF_PRIORITY &&
3080 : : !zone_reclaimable(zone))
3081 : 11762 : continue;
3082 : :
3083 : 40328 : sc.nr_scanned = 0;
3084 : :
3085 : : nr_soft_scanned = 0;
3086 : : /*
3087 : : * Call soft limit reclaim before calling shrink_zone.
3088 : : */
3089 : : nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3090 : : order, sc.gfp_mask,
3091 : : &nr_soft_scanned);
3092 : : sc.nr_reclaimed += nr_soft_reclaimed;
3093 : :
3094 : : /*
3095 : : * There should be no need to raise the scanning
3096 : : * priority if enough pages are already being scanned
3097 : :                          * that the high watermark would be met at 100%
3098 : : * efficiency.
3099 : : */
3100 [ + + ]: 40328 : if (kswapd_shrink_zone(zone, end_zone, &sc,
3101 : : lru_pages, &nr_attempted))
3102 : : raise_priority = false;
3103 : : }
3104 : :
3105 : : /*
3106 : : * If the low watermark is met there is no need for processes
3107 : :                  * to be throttled on pfmemalloc_wait as they should now be
3108 : :                  * able to make forward progress safely. Wake them up.
3109 : : */
3110 [ - + # # ]: 26045 : if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3111 : 0 : pfmemalloc_watermark_ok(pgdat))
3112 : 0 : wake_up(&pgdat->pfmemalloc_wait);
3113 : :
3114 : : /*
3115 : : * Fragmentation may mean that the system cannot be rebalanced
3116 : : * for high-order allocations in all zones. If twice the
3117 : : * allocation size has been reclaimed and the zones are still
3118 : : * not balanced then recheck the watermarks at order-0 to
3119 : :                  * prevent kswapd from reclaiming excessively. Assume that a process
3120 : :                  * requesting a high-order allocation can direct reclaim/compact itself.
3121 : : */
3122 [ + + ][ - + ]: 26045 : if (order && sc.nr_reclaimed >= 2UL << order)
3123 : 0 : order = sc.order = 0;
3124 : :
3125 : :                 /* Check if kswapd should be suspended (frozen or stopped) */
3126 [ + - ][ + - ]: 26045 : if (try_to_freeze() || kthread_should_stop())
3127 : : break;
3128 : :
3129 : : /*
3130 : : * Compact if necessary and kswapd is reclaiming at least the
3131 : :                  * high watermark number of pages as requested
3132 : : */
3133 [ - + ][ # # ]: 26045 : if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
3134 : 0 : compact_pgdat(pgdat, order);
3135 : :
3136 : : /*
3137 : : * Raise priority if scanning rate is too low or there was no
3138 : : * progress in reclaiming pages
3139 : : */
3140 [ + + ][ + + ]: 26045 : if (raise_priority || !sc.nr_reclaimed)
3141 : 25319 : sc.priority--;
3142 [ + + ]: 23994 : } while (sc.priority >= 1 &&
3143 [ + + ]: 26045 : !pgdat_balanced(pgdat, order, *classzone_idx));
3144 : :
3145 : : out:
3146 : : /*
3147 : :          * Return the order we were reclaiming at so that
3148 : :          * prepare_kswapd_sleep() can base its decision on it. However, if
3149 : :          * another caller entered the allocator slow path while kswapd was
3150 : :          * awake, order will remain at the higher level.
3151 : : */
3152 : 2530 : *classzone_idx = end_zone;
3153 : 2530 : return order;
3154 : : }
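: :
: : /*
: :  * Illustrative userspace model (not part of vmscan.c) of the two-pass
: :  * zone walk in balance_pgdat() above: scan highmem->dma to find the
: :  * highest zone that needs work ("end_zone"), then reclaim dma->highmem
: :  * up to it, mirroring the page allocator's fallback direction. Zone
: :  * names and watermark numbers below are invented for the demo.
: :  */
: : #include <stdio.h>
: :
: : struct model_zone { const char *name; long free; long high_wmark; };
: :
: : int main(void)
: : {
: :         struct model_zone z[] = {               /* index 0 = lowest (DMA) */
: :                 { "DMA",     120,  64 },
: :                 { "Normal",  900, 512 },
: :                 { "HighMem", 200, 256 },        /* below its high watermark */
: :         };
: :         int i, end_zone = -1;
: :
: :         /* Pass 1: highmem->dma, find the highest zone needing reclaim */
: :         for (i = 2; i >= 0; i--) {
: :                 if (z[i].free <= z[i].high_wmark) {
: :                         end_zone = i;
: :                         break;
: :                 }
: :         }
: :         if (end_zone < 0)
: :                 return 0;       /* all zones balanced: nothing to do */
: :
: :         /* Pass 2: dma->highmem, reclaim every zone up to end_zone */
: :         for (i = 0; i <= end_zone; i++)
: :                 printf("reclaim from %s\n", z[i].name);
: :         return 0;
: : }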
3155 : :
3156 : 0 : static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3157 : : {
3158 : : long remaining = 0;
3159 : 5060 : DEFINE_WAIT(wait);
3160 : :
3161 [ + - ][ + - ]: 2530 : if (freezing(current) || kthread_should_stop())
3162 : 0 : return;
3163 : :
3164 : 2530 : prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3165 : :
3166 : : /* Try to sleep for a short interval */
3167 [ + + ]: 2530 : if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3168 : 482 : remaining = schedule_timeout(HZ/10);
3169 : 482 : finish_wait(&pgdat->kswapd_wait, &wait);
3170 : 482 : prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3171 : : }
3172 : :
3173 : : /*
3174 : : * After a short sleep, check if it was a premature sleep. If not, then
3175 : : * go fully to sleep until explicitly woken up.
3176 : : */
3177 [ + + ]: 2530 : if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3178 : 11 : trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3179 : :
3180 : : /*
3181 : : * vmstat counters are not perfectly accurate and the estimated
3182 : : * value for counters such as NR_FREE_PAGES can deviate from the
3183 : : * true value by nr_online_cpus * threshold. To avoid the zone
3184 : : * watermarks being breached while under pressure, we reduce the
3185 : : * per-cpu vmstat threshold while kswapd is awake and restore
3186 : : * them before going back to sleep.
3187 : : */
3188 : 11 : set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3189 : :
3190 : : /*
3191 : : * Compaction records what page blocks it recently failed to
3192 : : * isolate pages from and skips them in the future scanning.
3193 : :                  * When kswapd is going to sleep, it is reasonable to assume that
3194 : :                  * pages have been freed and compaction may now succeed, so reset the cache.
3195 : : */
3196 : 11 : reset_isolation_suitable(pgdat);
3197 : :
3198 [ + - ]: 11 : if (!kthread_should_stop())
3199 : 11 : schedule();
3200 : :
3201 : 11 : set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3202 : : } else {
3203 [ + + ]: 2519 : if (remaining)
3204 : : count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3205 : : else
3206 : : count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3207 : : }
3208 : 2530 : finish_wait(&pgdat->kswapd_wait, &wait);
3209 : : }
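: :
: : /*
: :  * A userspace sketch (not vmscan.c code) of the two-phase nap
: :  * implemented by kswapd_try_to_sleep() above: take a short timed sleep
: :  * first, and only if the node still looks balanced afterwards commit
: :  * to a full sleep. node_balanced() is a made-up stand-in for
: :  * prepare_kswapd_sleep().
: :  */
: : #include <stdio.h>
: : #include <time.h>
: :
: : static int node_balanced(void)
: : {
: :         return 1;       /* pretend all watermarks are met */
: : }
: :
: : int main(void)
: : {
: :         if (node_balanced()) {
: :                 /* phase 1: short nap (HZ/10 in the kernel; 100ms here) */
: :                 struct timespec nap = { 0, 100 * 1000 * 1000 };
: :                 nanosleep(&nap, NULL);
: :         }
: :         if (node_balanced())    /* phase 2: was the nap premature? */
: :                 puts("still balanced: sleep until explicitly woken");
: :         else
: :                 puts("pressure returned during the nap: keep reclaiming");
: :         return 0;
: : }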
3210 : :
3211 : : /*
3212 : : * The background pageout daemon, started as a kernel thread
3213 : : * from the init process.
3214 : : *
3215 : : * This basically trickles out pages so that we have _some_
3216 : : * free memory available even if there is no other activity
3217 : : * that frees anything up. This is needed for things like routing
3218 : : * etc, where we otherwise might have all activity going on in
3219 : : * asynchronous contexts that cannot page things out.
3220 : : *
3221 : : * If there are applications that are active memory-allocators
3222 : : * (most normal use), this basically shouldn't matter.
3223 : : */
3224 : 0 : static int kswapd(void *p)
3225 : : {
3226 : : unsigned long order, new_order;
3227 : : unsigned balanced_order;
3228 : : int classzone_idx, new_classzone_idx;
3229 : : int balanced_classzone_idx;
3230 : : pg_data_t *pgdat = (pg_data_t*)p;
3231 : 0 : struct task_struct *tsk = current;
3232 : :
3233 : 0 : struct reclaim_state reclaim_state = {
3234 : : .reclaimed_slab = 0,
3235 : : };
3236 : 0 : const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3237 : :
3238 : : lockdep_set_current_reclaim_state(GFP_KERNEL);
3239 : :
3240 [ # # ]: 0 : if (!cpumask_empty(cpumask))
3241 : 0 : set_cpus_allowed_ptr(tsk, cpumask);
3242 : 0 : current->reclaim_state = &reclaim_state;
3243 : :
3244 : : /*
3245 : : * Tell the memory management that we're a "memory allocator",
3246 : : * and that if we need more memory we should get access to it
3247 : : * regardless (see "__alloc_pages()"). "kswapd" should
3248 : : * never get caught in the normal page freeing logic.
3249 : : *
3250 : : * (Kswapd normally doesn't need memory anyway, but sometimes
3251 : : * you need a small amount of memory in order to be able to
3252 : : * page out something else, and this flag essentially protects
3253 : : * us from recursively trying to free more memory as we're
3254 : : * trying to free the first piece of memory in the first place).
3255 : : */
3256 : 0 : tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3257 : 0 : set_freezable();
3258 : :
3259 : : order = new_order = 0;
3260 : : balanced_order = 0;
3261 : 0 : classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3262 : 2530 : balanced_classzone_idx = classzone_idx;
3263 : : for ( ; ; ) {
3264 : : bool ret;
3265 : :
3266 : : /*
3267 : :                  * If the last balance_pgdat was unsuccessful, it's unlikely a
3268 : :                  * new request of a similar or harder type will succeed soon, so
3269 : :                  * consider going to sleep on the basis of the order we last reclaimed at.
3270 : : */
3271 [ + - ]: 2530 : if (balanced_classzone_idx >= new_classzone_idx &&
3272 : 2530 : balanced_order == new_order) {
3273 : 2530 : new_order = pgdat->kswapd_max_order;
3274 : 2530 : new_classzone_idx = pgdat->classzone_idx;
3275 : 2530 : pgdat->kswapd_max_order = 0;
3276 : 2530 : pgdat->classzone_idx = pgdat->nr_zones - 1;
3277 : : }
3278 : :
3279 [ + - ]: 2530 : if (order < new_order || classzone_idx > new_classzone_idx) {
3280 : : /*
3281 : : * Don't sleep if someone wants a larger 'order'
3282 : :                          * allocation or has tighter zone constraints
3283 : : */
3284 : : order = new_order;
3285 : : classzone_idx = new_classzone_idx;
3286 : : } else {
3287 : 2530 : kswapd_try_to_sleep(pgdat, balanced_order,
3288 : : balanced_classzone_idx);
3289 : 2530 : order = pgdat->kswapd_max_order;
3290 : 2530 : classzone_idx = pgdat->classzone_idx;
3291 : : new_order = order;
3292 : : new_classzone_idx = classzone_idx;
3293 : 2530 : pgdat->kswapd_max_order = 0;
3294 : 2530 : pgdat->classzone_idx = pgdat->nr_zones - 1;
3295 : : }
3296 : :
3297 : : ret = try_to_freeze();
3298 [ + - ]: 2530 : if (kthread_should_stop())
3299 : : break;
3300 : :
3301 : : /*
3302 : : * We can speed up thawing tasks if we don't call balance_pgdat
3303 : : * after returning from the refrigerator
3304 : : */
3305 [ - + ]: 2530 : if (!ret) {
3306 : 2530 : trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3307 : 2530 : balanced_classzone_idx = classzone_idx;
3308 : 2530 : balanced_order = balance_pgdat(pgdat, order,
3309 : : &balanced_classzone_idx);
3310 : : }
3311 : : }
3312 : :
3313 : 0 : current->reclaim_state = NULL;
3314 : 0 : return 0;
3315 : : }
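: :
: : /*
: :  * Illustrative model (not vmscan.c code) of the PF_MEMALLOC idea used
: :  * by kswapd() above: a per-task "I am reclaiming" flag that keeps the
: :  * allocator from recursing back into reclaim while reclaim itself
: :  * needs a little memory to make progress. All names are invented.
: :  */
: : #include <stdio.h>
: : #include <stdlib.h>
: :
: : static int pf_memalloc;         /* models tsk->flags & PF_MEMALLOC */
: :
: : static void reclaim_model(void);
: :
: : static void *alloc_page_model(void)
: : {
: :         void *page = malloc(4096);
: :
: :         if (!page && !pf_memalloc)
: :                 reclaim_model();        /* normal task: may enter reclaim */
: :         return page;    /* flag set: never recurse into reclaim */
: : }
: :
: : static void reclaim_model(void)
: : {
: :         void *scratch;
: :
: :         pf_memalloc = 1;                /* tsk->flags |= PF_MEMALLOC */
: :         scratch = alloc_page_model();   /* safe: cannot re-enter reclaim */
: :         printf("reclaiming with scratch page %p\n", scratch);
: :         free(scratch);
: :         pf_memalloc = 0;                /* restored on the way out */
: : }
: :
: : int main(void)
: : {
: :         reclaim_model();
: :         return 0;
: : }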
3316 : :
3317 : : /*
3318 : : * A zone is low on free memory, so wake its kswapd task to service it.
3319 : : */
3320 : 0 : void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3321 : : {
3322 : : pg_data_t *pgdat;
3323 : :
3324 [ + + ]: 75713 : if (!populated_zone(zone))
3325 : : return;
3326 : :
3327 : : if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3328 : : return;
3329 : 75688 : pgdat = zone->zone_pgdat;
3330 [ + + ]: 75688 : if (pgdat->kswapd_max_order < order) {
3331 : 3 : pgdat->kswapd_max_order = order;
3332 : 3 : pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3333 : : }
3334 [ + + ]: 75688 : if (!waitqueue_active(&pgdat->kswapd_wait))
3335 : : return;
3336 [ + + ]: 30068 : if (zone_balanced(zone, order, 0, 0))
3337 : : return;
3338 : :
3339 : 484 : trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3340 : 484 : wake_up_interruptible(&pgdat->kswapd_wait);
3341 : : }
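: :
: : /*
: :  * A minimal model (not vmscan.c code) of the guard order in
: :  * wakeup_kswapd() above: publish the request (max order) first, then
: :  * skip the wakeup when nobody is sleeping or the zone is already
: :  * balanced. All names and the balance check are invented.
: :  */
: : #include <stdio.h>
: :
: : struct model_pgdat { int kswapd_max_order; int kswapd_sleeping; };
: :
: : static int zone_balanced_model(int order)
: : {
: :         return order == 0;      /* pretend only order-0 is satisfied */
: : }
: :
: : static void wakeup_kswapd_model(struct model_pgdat *pgdat, int order)
: : {
: :         if (pgdat->kswapd_max_order < order)
: :                 pgdat->kswapd_max_order = order; /* publish before checks */
: :         if (!pgdat->kswapd_sleeping)
: :                 return; /* already awake: it will pick up the new order */
: :         if (zone_balanced_model(order))
: :                 return; /* nothing to do */
: :         puts("wake kswapd");
: : }
: :
: : int main(void)
: : {
: :         struct model_pgdat pgdat = { .kswapd_max_order = 0,
: :                                      .kswapd_sleeping = 1 };
: :         wakeup_kswapd_model(&pgdat, 2);
: :         return 0;
: : }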
3342 : :
3343 : : /*
3344 : :  * The reclaimable count should be mostly accurate.
3345 : :  * The less reclaimable pages may be:
3346 : :  * - mlocked pages, which will be moved to the unevictable list when encountered
3347 : :  * - mapped pages, which may require several passes to be reclaimed
3348 : :  * - dirty pages, which are not "instantly" reclaimable
3349 : : */
3350 : 0 : unsigned long global_reclaimable_pages(void)
3351 : : {
3352 : : int nr;
3353 : :
3354 : 234717 : nr = global_page_state(NR_ACTIVE_FILE) +
3355 : : global_page_state(NR_INACTIVE_FILE);
3356 : :
3357 [ + ]: 234717 : if (get_nr_swap_pages() > 0)
3358 : 16 : nr += global_page_state(NR_ACTIVE_ANON) +
3359 : : global_page_state(NR_INACTIVE_ANON);
3360 : :
3361 : 0 : return nr;
3362 : : }
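: :
: : /*
: :  * A userspace sketch recomputing the estimate above from /proc/vmstat:
: :  * file LRU pages, plus the anon LRU pages only when free swap exists.
: :  * Assumes the usual nr_*_file/nr_*_anon field names; the sysinfo()
: :  * check is a crude stand-in for get_nr_swap_pages() > 0.
: :  */
: : #include <stdio.h>
: : #include <string.h>
: : #include <sys/sysinfo.h>
: :
: : static long vmstat_value(const char *key)
: : {
: :         char name[64];
: :         long val;
: :         FILE *f = fopen("/proc/vmstat", "r");
: :
: :         if (!f)
: :                 return 0;
: :         while (fscanf(f, "%63s %ld", name, &val) == 2) {
: :                 if (!strcmp(name, key)) {
: :                         fclose(f);
: :                         return val;
: :                 }
: :         }
: :         fclose(f);
: :         return 0;
: : }
: :
: : int main(void)
: : {
: :         struct sysinfo si;
: :         long nr = vmstat_value("nr_active_file") +
: :                   vmstat_value("nr_inactive_file");
: :
: :         if (sysinfo(&si) == 0 && si.freeswap > 0)
: :                 nr += vmstat_value("nr_active_anon") +
: :                       vmstat_value("nr_inactive_anon");
: :         printf("estimated reclaimable pages: %ld\n", nr);
: :         return 0;
: : }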
3363 : :
3364 : : #ifdef CONFIG_HIBERNATION
3365 : : /*
3366 : :  * Try to free `nr_to_reclaim' pages of memory, system-wide, and return the number of
3367 : : * freed pages.
3368 : : *
3369 : : * Rather than trying to age LRUs the aim is to preserve the overall
3370 : : * LRU order by reclaiming preferentially
3371 : : * inactive > active > active referenced > active mapped
3372 : : */
3373 : : unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3374 : : {
3375 : : struct reclaim_state reclaim_state;
3376 : : struct scan_control sc = {
3377 : : .gfp_mask = GFP_HIGHUSER_MOVABLE,
3378 : : .may_swap = 1,
3379 : : .may_unmap = 1,
3380 : : .may_writepage = 1,
3381 : : .nr_to_reclaim = nr_to_reclaim,
3382 : : .hibernation_mode = 1,
3383 : : .order = 0,
3384 : : .priority = DEF_PRIORITY,
3385 : : };
3386 : : struct shrink_control shrink = {
3387 : : .gfp_mask = sc.gfp_mask,
3388 : : };
3389 : : struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3390 : : struct task_struct *p = current;
3391 : : unsigned long nr_reclaimed;
3392 : :
3393 : : p->flags |= PF_MEMALLOC;
3394 : : lockdep_set_current_reclaim_state(sc.gfp_mask);
3395 : : reclaim_state.reclaimed_slab = 0;
3396 : : p->reclaim_state = &reclaim_state;
3397 : :
3398 : : nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3399 : :
3400 : : p->reclaim_state = NULL;
3401 : : lockdep_clear_current_reclaim_state();
3402 : : p->flags &= ~PF_MEMALLOC;
3403 : :
3404 : : return nr_reclaimed;
3405 : : }
3406 : : #endif /* CONFIG_HIBERNATION */
3407 : :
3408 : : /* It's optimal to keep kswapds on the same CPUs as their memory, but
3409 : : not required for correctness. So if the last cpu in a node goes
3410 : :    away, kswapd is allowed to run anywhere; when the first one comes
3411 : :    back, its cpu binding is restored. */
3412 : 0 : static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3413 : : void *hcpu)
3414 : : {
3415 : : int nid;
3416 : :
3417 [ # # ]: 0 : if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3418 [ # # ]: 0 : for_each_node_state(nid, N_MEMORY) {
3419 : : pg_data_t *pgdat = NODE_DATA(nid);
3420 : : const struct cpumask *mask;
3421 : :
3422 : 0 : mask = cpumask_of_node(pgdat->node_id);
3423 : :
3424 [ # # ]: 0 : if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3425 : : /* One of our CPUs online: restore mask */
3426 : 0 : set_cpus_allowed_ptr(pgdat->kswapd, mask);
3427 : : }
3428 : : }
3429 : 0 : return NOTIFY_OK;
3430 : : }
3431 : :
3432 : : /*
3433 : : * This kswapd start function will be called by init and node-hot-add.
3434 : :  * On node hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
3435 : : */
3436 : 0 : int kswapd_run(int nid)
3437 : : {
3438 : : pg_data_t *pgdat = NODE_DATA(nid);
3439 : : int ret = 0;
3440 : :
3441 [ # # ]: 0 : if (pgdat->kswapd)
3442 : : return 0;
3443 : :
3444 [ # # ]: 0 : pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3445 [ # # ]: 0 : if (IS_ERR(pgdat->kswapd)) {
3446 : : /* failure at boot is fatal */
3447 [ # # ]: 0 : BUG_ON(system_state == SYSTEM_BOOTING);
3448 : 0 : pr_err("Failed to start kswapd on node %d\n", nid);
3449 : 0 : ret = PTR_ERR(pgdat->kswapd);
3450 : 0 : pgdat->kswapd = NULL;
3451 : : }
3452 : 0 : return ret;
3453 : : }
3454 : :
3455 : : /*
3456 : : * Called by memory hotplug when all memory in a node is offlined. Caller must
3457 : : * hold lock_memory_hotplug().
3458 : : */
3459 : 0 : void kswapd_stop(int nid)
3460 : : {
3461 : 0 : struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3462 : :
3463 [ # # ]: 0 : if (kswapd) {
3464 : 0 : kthread_stop(kswapd);
3465 : 0 : NODE_DATA(nid)->kswapd = NULL;
3466 : : }
3467 : 0 : }
3468 : :
3469 : 0 : static int __init kswapd_init(void)
3470 : : {
3471 : : int nid;
3472 : :
3473 : 0 : swap_setup();
3474 [ # # ]: 0 : for_each_node_state(nid, N_MEMORY)
3475 : 0 : kswapd_run(nid);
3476 : 0 : hotcpu_notifier(cpu_callback, 0);
3477 : 0 : return 0;
3478 : : }
3479 : :
3480 : : module_init(kswapd_init)
3481 : :
3482 : : #ifdef CONFIG_NUMA
3483 : : /*
3484 : : * Zone reclaim mode
3485 : : *
3486 : : * If non-zero call zone_reclaim when the number of free pages falls below
3487 : : * the watermarks.
3488 : : */
3489 : : int zone_reclaim_mode __read_mostly;
3490 : :
3491 : : #define RECLAIM_OFF 0
3492 : : #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
3493 : : #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
3494 : : #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
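: :
: : /*
: :  * The mode is a bitmask, so e.g. "sysctl vm.zone_reclaim_mode=3"
: :  * enables RECLAIM_ZONE|RECLAIM_WRITE. A minimal decoding demo,
: :  * with the bit values copied from the defines above:
: :  */
: : #include <stdio.h>
: :
: : int main(void)
: : {
: :         int mode = (1 << 0) | (1 << 1); /* RECLAIM_ZONE | RECLAIM_WRITE */
: :
: :         printf("reclaim zone: %s\n", mode & (1 << 0) ? "yes" : "no");
: :         printf("write dirty:  %s\n", mode & (1 << 1) ? "yes" : "no");
: :         printf("swap pages:   %s\n", mode & (1 << 2) ? "yes" : "no");
: :         return 0;
: : }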
3495 : :
3496 : : /*
3497 : :  * Priority for ZONE_RECLAIM. This determines the fraction of pages
3498 : :  * of a zone considered in each zone_reclaim pass: a priority of 4
3499 : :  * scans 1/16th (i.e. size >> 4) of a zone.
3500 : : */
3501 : : #define ZONE_RECLAIM_PRIORITY 4
3502 : :
3503 : : /*
3504 : : * Percentage of pages in a zone that must be unmapped for zone_reclaim to
3505 : : * occur.
3506 : : */
3507 : : int sysctl_min_unmapped_ratio = 1;
3508 : :
3509 : : /*
3510 : : * If the number of slab pages in a zone grows beyond this percentage then
3511 : : * slab reclaim needs to occur.
3512 : : */
3513 : : int sysctl_min_slab_ratio = 5;
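: :
: : /*
: :  * Both ratios are percentages of a zone's pages; the conversion to the
: :  * per-zone min_unmapped_pages/min_slab_pages thresholds happens
: :  * elsewhere (in page_alloc.c). Illustrative arithmetic, assuming a
: :  * hypothetical 1,000,000-page zone and the defaults above:
: :  */
: : #include <stdio.h>
: :
: : int main(void)
: : {
: :         unsigned long zone_pages = 1000000UL;   /* hypothetical zone size */
: :         unsigned long min_unmapped = zone_pages * 1 / 100; /* 10000 pages */
: :         unsigned long min_slab = zone_pages * 5 / 100;     /* 50000 pages */
: :
: :         printf("reclaim unmapped file pages only above %lu\n", min_unmapped);
: :         printf("reclaim slab pages only above %lu\n", min_slab);
: :         return 0;
: : }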
3514 : :
3515 : : static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3516 : : {
3517 : : unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3518 : : unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3519 : : zone_page_state(zone, NR_ACTIVE_FILE);
3520 : :
3521 : : /*
3522 : : * It's possible for there to be more file mapped pages than
3523 : : * accounted for by the pages on the file LRU lists because
3524 : : * tmpfs pages accounted for as ANON can also be FILE_MAPPED
3525 : : */
3526 : : return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3527 : : }
3528 : :
3529 : : /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3530 : : static long zone_pagecache_reclaimable(struct zone *zone)
3531 : : {
3532 : : long nr_pagecache_reclaimable;
3533 : : long delta = 0;
3534 : :
3535 : : /*
3536 : : * If RECLAIM_SWAP is set, then all file pages are considered
3537 : : * potentially reclaimable. Otherwise, we have to worry about
3538 : :  * pages like swapcache, and zone_unmapped_file_pages() provides
3539 : :  * a better estimate.
3540 : : */
3541 : : if (zone_reclaim_mode & RECLAIM_SWAP)
3542 : : nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3543 : : else
3544 : : nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3545 : :
3546 : : /* If we can't clean pages, remove dirty pages from consideration */
3547 : : if (!(zone_reclaim_mode & RECLAIM_WRITE))
3548 : : delta += zone_page_state(zone, NR_FILE_DIRTY);
3549 : :
3550 : : /* Watch for any possible underflows due to delta */
3551 : : if (unlikely(delta > nr_pagecache_reclaimable))
3552 : : delta = nr_pagecache_reclaimable;
3553 : :
3554 : : return nr_pagecache_reclaimable - delta;
3555 : : }
3556 : :
3557 : : /*
3558 : : * Try to free up some pages from this zone through reclaim.
3559 : : */
3560 : : static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3561 : : {
3562 : : /* Minimum pages needed in order to stay on node */
3563 : : const unsigned long nr_pages = 1 << order;
3564 : : struct task_struct *p = current;
3565 : : struct reclaim_state reclaim_state;
3566 : : struct scan_control sc = {
3567 : : .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3568 : : .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3569 : : .may_swap = 1,
3570 : : .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3571 : : .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3572 : : .order = order,
3573 : : .priority = ZONE_RECLAIM_PRIORITY,
3574 : : };
3575 : : struct shrink_control shrink = {
3576 : : .gfp_mask = sc.gfp_mask,
3577 : : };
3578 : : unsigned long nr_slab_pages0, nr_slab_pages1;
3579 : :
3580 : : cond_resched();
3581 : : /*
3582 : : * We need to be able to allocate from the reserves for RECLAIM_SWAP
3583 : : * and we also need to be able to write out pages for RECLAIM_WRITE
3584 : : * and RECLAIM_SWAP.
3585 : : */
3586 : : p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3587 : : lockdep_set_current_reclaim_state(gfp_mask);
3588 : : reclaim_state.reclaimed_slab = 0;
3589 : : p->reclaim_state = &reclaim_state;
3590 : :
3591 : : if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3592 : : /*
3593 : : * Free memory by calling shrink zone with increasing
3594 : : * priorities until we have enough memory freed.
3595 : : */
3596 : : do {
3597 : : shrink_zone(zone, &sc);
3598 : : } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3599 : : }
3600 : :
3601 : : nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3602 : : if (nr_slab_pages0 > zone->min_slab_pages) {
3603 : : /*
3604 : : * shrink_slab() does not currently allow us to determine how
3605 : : * many pages were freed in this zone. So we take the current
3606 : : * number of slab pages and shake the slab until it is reduced
3607 : : * by the same nr_pages that we used for reclaiming unmapped
3608 : : * pages.
3609 : : */
3610 : : nodes_clear(shrink.nodes_to_scan);
3611 : : node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3612 : : for (;;) {
3613 : : unsigned long lru_pages = zone_reclaimable_pages(zone);
3614 : :
3615 : : /* No reclaimable slab or very low memory pressure */
3616 : : if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3617 : : break;
3618 : :
3619 : : /* Freed enough memory */
3620 : : nr_slab_pages1 = zone_page_state(zone,
3621 : : NR_SLAB_RECLAIMABLE);
3622 : : if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3623 : : break;
3624 : : }
3625 : :
3626 : : /*
3627 : : * Update nr_reclaimed by the number of slab pages we
3628 : : * reclaimed from this zone.
3629 : : */
3630 : : nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3631 : : if (nr_slab_pages1 < nr_slab_pages0)
3632 : : sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3633 : : }
3634 : :
3635 : : p->reclaim_state = NULL;
3636 : : current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3637 : : lockdep_clear_current_reclaim_state();
3638 : : return sc.nr_reclaimed >= nr_pages;
3639 : : }
3640 : :
3641 : : int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3642 : : {
3643 : : int node_id;
3644 : : int ret;
3645 : :
3646 : : /*
3647 : : * Zone reclaim reclaims unmapped file backed pages and
3648 : : * slab pages if we are over the defined limits.
3649 : : *
3650 : : * A small portion of unmapped file backed pages is needed for
3651 : : * file I/O otherwise pages read by file I/O will be immediately
3652 : : * thrown out if the zone is overallocated. So we do not reclaim
3653 : : * if less than a specified percentage of the zone is used by
3654 : : * unmapped file backed pages.
3655 : : */
3656 : : if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3657 : : zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3658 : : return ZONE_RECLAIM_FULL;
3659 : :
3660 : : if (!zone_reclaimable(zone))
3661 : : return ZONE_RECLAIM_FULL;
3662 : :
3663 : : /*
3664 : : * Do not scan if the allocation should not be delayed.
3665 : : */
3666 : : if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3667 : : return ZONE_RECLAIM_NOSCAN;
3668 : :
3669 : : /*
3670 : : * Only run zone reclaim on the local zone or on zones that do not
3671 : : * have associated processors. This will favor the local processor
3672 : :  * over remote processors and spread off-node memory allocations
3673 : :  * as widely as possible.
3674 : : */
3675 : : node_id = zone_to_nid(zone);
3676 : : if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3677 : : return ZONE_RECLAIM_NOSCAN;
3678 : :
3679 : : if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3680 : : return ZONE_RECLAIM_NOSCAN;
3681 : :
3682 : : ret = __zone_reclaim(zone, gfp_mask, order);
3683 : : zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3684 : :
3685 : : if (!ret)
3686 : : count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3687 : :
3688 : : return ret;
3689 : : }
3690 : : #endif
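: :
: : /*
: :  * A hedged sketch of how a caller consumes zone_reclaim()'s return
: :  * codes; the real consumer is the page allocator's zonelist walk, and
: :  * the enum values here are local stand-ins, not the kernel's
: :  * definitions.
: :  */
: : #include <stdio.h>
: :
: : enum { MODEL_RECLAIM_NOSCAN = -2, MODEL_RECLAIM_FULL = -1,
: :        MODEL_RECLAIM_SOME = 0, MODEL_RECLAIM_SUCCESS = 1 };
: :
: : static int zone_reclaim_model(void)
: : {
: :         return MODEL_RECLAIM_SUCCESS;   /* pretend reclaim worked */
: : }
: :
: : int main(void)
: : {
: :         switch (zone_reclaim_model()) {
: :         case MODEL_RECLAIM_NOSCAN:      /* reclaim was not attempted */
: :         case MODEL_RECLAIM_FULL:        /* scanned, nothing reclaimable */
: :                 puts("skip this zone, try the next one in the zonelist");
: :                 break;
: :         default:
: :                 puts("recheck the watermark and allocate if it now passes");
: :                 break;
: :         }
: :         return 0;
: : }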
3691 : :
3692 : : /*
3693 : : * page_evictable - test whether a page is evictable
3694 : : * @page: the page to test
3695 : : *
3696 : : * Test whether page is evictable--i.e., should be placed on active/inactive
3697 : : * lists vs unevictable list.
3698 : : *
3699 : : * Reasons page might not be evictable:
3700 : : * (1) page's mapping marked unevictable
3701 : : * (2) page is part of an mlocked VMA
3702 : : *
3703 : : */
3704 : 0 : int page_evictable(struct page *page)
3705 : : {
3706 [ + + ][ + + ]: 552878 : return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3707 : : }
3708 : :
3709 : : #ifdef CONFIG_SHMEM
3710 : : /**
3711 : : * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
3712 : : * @pages: array of pages to check
3713 : : * @nr_pages: number of pages to check
3714 : : *
3715 : : * Checks pages for evictability and moves them to the appropriate lru list.
3716 : : *
3717 : : * This function is only used for SysV IPC SHM_UNLOCK.
3718 : : */
3719 : 0 : void check_move_unevictable_pages(struct page **pages, int nr_pages)
3720 : : {
3721 : : struct lruvec *lruvec;
3722 : : struct zone *zone = NULL;
3723 : : int pgscanned = 0;
3724 : : int pgrescued = 0;
3725 : : int i;
3726 : :
3727 [ + + ]: 2 : for (i = 0; i < nr_pages; i++) {
3728 : 1 : struct page *page = pages[i];
3729 : : struct zone *pagezone;
3730 : :
3731 : 1 : pgscanned++;
3732 : 1 : pagezone = page_zone(page);
3733 [ + - ]: 1 : if (pagezone != zone) {
3734 [ - + ]: 1 : if (zone)
3735 : : spin_unlock_irq(&zone->lru_lock);
3736 : : zone = pagezone;
3737 : : spin_lock_irq(&zone->lru_lock);
3738 : : }
3739 : : lruvec = mem_cgroup_page_lruvec(page, zone);
3740 : :
3741 [ + - ][ + - ]: 2 : if (!PageLRU(page) || !PageUnevictable(page))
3742 : 1 : continue;
3743 : :
3744 [ # # ]: 0 : if (page_evictable(page)) {
3745 : : enum lru_list lru = page_lru_base_type(page);
3746 : :
3747 : : VM_BUG_ON(PageActive(page));
3748 : : ClearPageUnevictable(page);
3749 : : del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3750 : : add_page_to_lru_list(page, lruvec, lru);
3751 : 0 : pgrescued++;
3752 : : }
3753 : : }
3754 : :
3755 [ + - ]: 1 : if (zone) {
3756 : : __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3757 : : __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3758 : : spin_unlock_irq(&zone->lru_lock);
3759 : : }
3760 : 1 : }
3761 : : #endif /* CONFIG_SHMEM */
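: :
: : /*
: :  * A userspace program that exercises the path above: SHM_LOCK moves a
: :  * segment's pages to the unevictable list and SHM_UNLOCK ends up in
: :  * check_move_unevictable_pages() to put them back. Locking may require
: :  * CAP_IPC_LOCK or a sufficient RLIMIT_MEMLOCK.
: :  */
: : #include <stdio.h>
: : #include <string.h>
: : #include <sys/ipc.h>
: : #include <sys/shm.h>
: :
: : int main(void)
: : {
: :         int id = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);
: :         char *p;
: :
: :         if (id < 0) {
: :                 perror("shmget");
: :                 return 1;
: :         }
: :         p = shmat(id, NULL, 0);
: :         if (p == (void *)-1) {
: :                 perror("shmat");
: :                 return 1;
: :         }
: :         memset(p, 0, 1 << 20);                  /* fault the pages in */
: :
: :         if (shmctl(id, SHM_LOCK, NULL))         /* pages -> unevictable */
: :                 perror("shmctl(SHM_LOCK)");
: :         else if (shmctl(id, SHM_UNLOCK, NULL))  /* rescued to normal LRUs */
: :                 perror("shmctl(SHM_UNLOCK)");
: :
: :         shmdt(p);
: :         shmctl(id, IPC_RMID, NULL);
: :         return 0;
: : }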
3762 : :
3763 : 0 : static void warn_scan_unevictable_pages(void)
3764 : : {
3765 [ + + ]: 2 : printk_once(KERN_WARNING
3766 : : "%s: The scan_unevictable_pages sysctl/node-interface has been "
3767 : : "disabled for lack of a legitimate use case. If you have "
3768 : : "one, please send an email to linux-mm@kvack.org.\n",
3769 : : current->comm);
3770 : 0 : }
3771 : :
3772 : : /*
3773 : : * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3774 : : * all nodes' unevictable lists for evictable pages
3775 : : */
3776 : : unsigned long scan_unevictable_pages;
3777 : :
3778 : 0 : int scan_unevictable_handler(struct ctl_table *table, int write,
3779 : : void __user *buffer,
3780 : : size_t *length, loff_t *ppos)
3781 : : {
3782 : 2 : warn_scan_unevictable_pages();
3783 : 2 : proc_doulongvec_minmax(table, write, buffer, length, ppos);
3784 : 2 : scan_unevictable_pages = 0;
3785 : 2 : return 0;
3786 : : }
3787 : :
3788 : : #ifdef CONFIG_NUMA
3789 : : /*
3790 : : * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3791 : : * a specified node's per zone unevictable lists for evictable pages.
3792 : : */
3793 : :
3794 : : static ssize_t read_scan_unevictable_node(struct device *dev,
3795 : : struct device_attribute *attr,
3796 : : char *buf)
3797 : : {
3798 : : warn_scan_unevictable_pages();
3799 : : return sprintf(buf, "0\n"); /* always zero; should fit... */
3800 : : }
3801 : :
3802 : : static ssize_t write_scan_unevictable_node(struct device *dev,
3803 : : struct device_attribute *attr,
3804 : : const char *buf, size_t count)
3805 : : {
3806 : : warn_scan_unevictable_pages();
3807 : : return 1;
3808 : : }
3809 : :
3810 : :
3811 : : static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3812 : : read_scan_unevictable_node,
3813 : : write_scan_unevictable_node);
3814 : :
3815 : : int scan_unevictable_register_node(struct node *node)
3816 : : {
3817 : : return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3818 : : }
3819 : :
3820 : : void scan_unevictable_unregister_node(struct node *node)
3821 : : {
3822 : : device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3823 : : }
3824 : : #endif