git.dujemihanovic.xyz Git - linux.git/commitdiff
mm/vmalloc: fix numa spreading for large hash tables
author: Chen Wandun <chenwandun@huawei.com>
Thu, 28 Oct 2021 21:36:24 +0000 (14:36 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Fri, 29 Oct 2021 00:18:55 +0000 (17:18 -0700)
Eric Dumazet reported strange NUMA spreading behavior in [1], and found
that commit 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings")
introduced this issue [2].

Digging into the behavior before and after commit 121e6f3258fe shows that
the page allocation path differs:

before:
  alloc_large_system_hash
    __vmalloc
      __vmalloc_node(..., NUMA_NO_NODE, ...)
        __vmalloc_node_range
          __vmalloc_area_node
            alloc_page /* because NUMA_NO_NODE, so choose alloc_page branch */
              alloc_pages_current
                alloc_page_interleave /* can be proved by print policy mode */

after:
  alloc_large_system_hash
    __vmalloc
      __vmalloc_node(..., NUMA_NO_NODE, ...)
        __vmalloc_node_range
          __vmalloc_area_node
            alloc_pages_node /* choose nid by numa_mem_id() */
              __alloc_pages_node(nid, ....)

So after commit 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings"),
memory is allocated on the current node instead of being interleaved
across nodes.

Link: https://lore.kernel.org/linux-mm/CANn89iL6AAyWhfxdHO+jaT075iOa3XcYn9k6JJc7JR2XYn6k_Q@mail.gmail.com/ [1]
Link: https://lore.kernel.org/linux-mm/CANn89iLofTR=AK-QOZY87RdUZENCZUT4O6a0hvhu3_EwRMerOg@mail.gmail.com/ [2]
Link: https://lkml.kernel.org/r/20211021080744.874701-2-chenwandun@huawei.com
Fixes: 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings")
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reported-by: Eric Dumazet <edumazet@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/vmalloc.c

index d77830ff604cab1796a96e31ad1423a790c61ca7..e8a807c7811077414c4efbced29a16ab75f15b22 100644 (file)
@@ -2816,6 +2816,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
 {
        unsigned int nr_allocated = 0;
+       struct page *page;
+       int i;
 
        /*
         * For order-0 pages we make use of bulk allocator, if
@@ -2823,7 +2825,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
         * to fails, fallback to a single page allocator that is
         * more permissive.
         */
-       if (!order) {
+       if (!order && nid != NUMA_NO_NODE) {
                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;
 
@@ -2848,7 +2850,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                        if (nr != nr_pages_request)
                                break;
                }
-       } else
+       } else if (order)
                /*
                 * Compound pages required for remap_vmalloc_page if
                 * high-order pages.
@@ -2856,11 +2858,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                gfp |= __GFP_COMP;
 
        /* High-order pages or fallback path if "bulk" fails. */
-       while (nr_allocated < nr_pages) {
-               struct page *page;
-               int i;
 
-               page = alloc_pages_node(nid, gfp, order);
+       while (nr_allocated < nr_pages) {
+               if (nid == NUMA_NO_NODE)
+                       page = alloc_pages(gfp, order);
+               else
+                       page = alloc_pages_node(nid, gfp, order);
                if (unlikely(!page))
                        break;