从零到负一

【LM09】Memblock到伙伴系统的切换

2023/04/25

在【LM07】重新回顾内存初始化这篇笔记中,我简单列举了start_kernel()中的一些和内存管理相关的函数。这篇笔记主要分析mm_init(),同时也顺便完成对build_all_zonelists()的简要分析。

build_all_zonelists()

这部分还是比较重要的,当学到伙伴系统后,发现有几个知识点和zonelist相关,但之前由于没有仔细学习zonelist,不太理解,这里重新回来学习zonelist。我们先来看相关的结构体,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
// include/linux/mmzone.h

enum zone_type {
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};

/*
* This struct contains information about a zone in a zonelist. It is stored
* here to avoid dereferences into large structures and lookups of tables
*/
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};

/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
* To speed the reading of the zonelist, the zonerefs contain the zone index
* of the entry being read. Helper functions to access information given
* a struct zoneref are
*
* zonelist_zone() - Return the struct zone * for an entry in _zonerefs
* zonelist_zone_idx() - Return the index of the zone for an entry
* zonelist_node_idx() - Return the index of the node for an entry
*/
struct zonelist {
// The last array slot is left NULL so it can terminate a walk over the
// zones - see for_next_zone_zonelist_nodemask()
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

// NOTE(review): abridged excerpt - the real pglist_data has many more
// fields and the typedef ends with "} pg_data_t;".
typedef struct pglist_data {
// A node holds MAX_NR_ZONES zones, laid out in the order of the
// zone_type enum above.
struct zone node_zones[MAX_NR_ZONES];
// On UMA, node_zonelists[] contains a single zonelist.
struct zonelist node_zonelists[MAX_ZONELISTS];
}

接下来我们会去看zonelist是如何完成初始化的。 但在这之前,我简单讲下zonelist是干什么的 - 内核在进行内存分配时,zonelist负责提供一个Node以及Zone的优先级。内核先从优先级高的Node/Zone中寻找空闲的内存空间,如果没有足够的内存空间,再去优先级低的Node/Zone中寻找(更多细节请参考内存管理(一)node & zone),下面我们来看看build_all_zonelists()这个函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
// ./mm/page_alloc.c

// Call chain from build_all_zonelists() down to build_zonerefs_node():
// build_all_zonelists() -> build_all_zonelists_init() -> __build_all_zonelists()
// -> build_zonelists() -> build_zonelists_in_node_order() -> build_zonerefs_node()
void __ref build_all_zonelists(pg_data_t *pgdat)
{
// During early boot, build the zonelists of all online nodes.
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
// After boot (e.g. memory hotplug) only rebuild for the given node.
__build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
}

// Boot-time variant: passing NULL makes __build_all_zonelists() walk
// every online node.
static noinline void __init build_all_zonelists_init(void)
{
__build_all_zonelists(NULL);
}

// NOTE(review): abridged excerpt - the kernel version declares 'self'
// (derived from 'data') and 'nid' locally; those lines are elided here.
static void __build_all_zonelists(void *data)
{
/*
* This node is hotadded and no memory is yet present. So just
* building zonelists is fine - no need to touch other nodes.
*/
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
// Otherwise rebuild the zonelist of every online node.
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
}
}
}

/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
static void build_zonelists(pg_data_t *pgdat)
{
// pgdat is the node the current CPU belongs to.
// NOTE(review): abridged excerpt - the kernel version computes
// node_order[] and nr_nodes before making this call.
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
}

/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
unsigned nr_nodes)
{
struct zoneref *zonerefs;
int i;
// --------------------------------------------------------------- (1)
// Fetch the fallback zonelist of the current node.
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;

for (i = 0; i < nr_nodes; i++) {
int nr_zones;
pg_data_t *node = NODE_DATA(node_order[i]);
// ----------------------------------------------------------- (2)
// Fill zonerefs: for each node visited, append all of its zones
// (already ordered) before moving to the next node. Hence even the
// lowest-priority zone of a higher-priority node outranks the
// highest-priority zone of a lower-priority node, e.g.:
// [N0Z3][N0Z2][N0Z0][N0Z1] | [N2Z3][N2Z1][N2Z0][N2Z2] | ...
nr_zones = build_zonerefs_node(node, zonerefs);
zonerefs += nr_zones;
}
// NULL-terminate the array so zonelist walks know where to stop.
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}

/*
* Builds allocation fallback zone lists.
* Add all populated zones of a node to the zonelist.
*/
static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
int nr_zones = 0;

do {
// ----------------------------------------------------------- (3)
// Note that zone_type counts down from the top of the enum.
zone_type--;
zone = pgdat->node_zones + zone_type;
if (managed_zone(zone)) {
// ------------------------------------------------------- (4)
// Zones with a higher zone_type are placed first in zonerefs;
// e.g. on a system with ZONE_HIGHMEM, ZONE_HIGHMEM ends up in
// zonerefs[0]. zoneref_set_zone() does:
// zoneref->zone = zone;
// zoneref->zone_idx = zone_idx(zone);
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
return nr_zones;
}

至此,zonelist创建成功。有一点需要注意,(4)处zonerefs[]的索引和其元素的zone_idx并不相同(可以说毫无关系),不要误认为它们是一样的。

mm_init()

在mm_init()中,mem_init()是最重要的函数之一,正是它完成了memblock到伙伴系统的切换,下面我会花大量的时间来分析这个函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// ./init/main.c

/*
* Set up kernel memory allocators
*/
static void __init mm_init(void)
{
/*
* page_ext requires contiguous pages,
* bigger than MAX_ORDER unless SPARSEMEM.
*/
// Empty under the sparsemem memory model.
page_ext_init_flatmem();
// Key function: hands memory over from memblock to the buddy allocator.
mem_init();
kmem_cache_init();
pgtable_init();
debug_objects_mem_init();
vmalloc_init();
ioremap_huge_init();
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
pti_init();
}

mem_init()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// ./arch/arm64/mm/init.c

/*
* mem_init() marks the free areas in the mem_map and tells us how much memory
* is free. This is done after various parts of the system have claimed their
* memory after the kernel image.
*/
void __init mem_init(void)
{
if (swiotlb_force == SWIOTLB_FORCE ||
max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
swiotlb_init(1);
else
swiotlb_force = SWIOTLB_NO_FORCE;
// Update max_mapnr. Under sparsemem neither max_mapnr nor mem_map is
// used, so on a sparsemem kernel this statement can be ignored.
set_max_mapnr(pfn_to_page(max_pfn) - mem_map);

#ifndef CONFIG_SPARSEMEM_VMEMMAP
// ----------------------------------------------------------- (1)
// Free previously allocated but unused parts of mem_map.
free_unused_memmap();
#endif
/* this will put all unused low memory onto the freelists */
memblock_free_all();

kexec_reserve_crashkres_pages();

mem_init_print_info(NULL);

/*
* Check boundaries twice: Some fundamental inconsistencies can be
* detected at build time already.
*/
#ifdef CONFIG_COMPAT
BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
extern int sysctl_overcommit_memory;
/*
* On a machine this small we won't get anywhere without
* overcommit, so turn it on by default.
*/
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}

(1) 这句用于释放之前已经分配但没使用的mem_map空间。这个函数同样作用于sparsemem,尽管它不用mem_map。下面来看看它的源码,

free_unused_memmap()

这个函数大方向很容易理解,释放掉那些不需要管理的物理页对应的page,但细节我总觉得有些地方不太明白,不深究了,直接看注释吧。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// ./arch/arm64/mm/init.c

/*
* The mem_map array can get very big. Free the unused area of the memory map.
*/
static void __init free_unused_memmap(void)
{
unsigned long start, prev_end = 0;
struct memblock_region *reg;

for_each_memblock(memory, reg) {
start = __phys_to_pfn(reg->base);

// NOTE(review, original author's understanding): sparsemem does not
// over-allocate memmap space for pages that do not exist, so memblock
// memory previously allocated for the memmap is not freed here.
#ifdef CONFIG_SPARSEMEM
/*
* Take care not to free memmap entries that don't exist due
* to SPARSEMEM sections which aren't present.
*/
// ALIGN() here rounds up.
start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
#endif
/*
* If we had a previous bank, and there is a space between the
* current bank and the previous, free it.
*/
if (prev_end && prev_end < start)
free_memmap(prev_end, start);

/*
* Align up here since the VM subsystem insists that the
* memmap entries are valid from the bank end aligned to
* MAX_ORDER_NR_PAGES.
*/
prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
MAX_ORDER_NR_PAGES);
}

// For sparsemem, the only remaining range to free is from the last PFN up
// to ALIGN(prev_end, PAGES_PER_SECTION).
// NOTE(review, original author's doubt): arguably this should be done for
// the tail of every memblock.memory region, not just the last one.
#ifdef CONFIG_SPARSEMEM
if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
#endif
}

memblock_free_all()

这个函数是核心函数,它用于将memblock中管理的空闲页释放给伙伴系统。我们来看看它调用了哪些函数,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// ./mm/memblock.c

/**
* memblock_free_all - release free pages to the buddy allocator
*
* Return: the number of pages actually released.
*/
unsigned long __init memblock_free_all(void)
{
unsigned long pages;
// Walk every node/zone and reset each zone's managed_pages to 0.
reset_all_zones_managed_pages();

pages = free_low_memory_core_early();
// Account the released pages in the global total of RAM pages.
totalram_pages_add(pages);

return pages;
}

这里重点看看free_low_memory_core_early()函数,

free_low_memory_core_early()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// ./mm/memblock.c

// Hand every free memblock range to the buddy allocator; returns the
// number of pages released.
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
phys_addr_t start, end;
u64 i;

memblock_clear_hotplug(0, -1);
// --------------------------------------------- (1)
// Walk all reserved regions and initialize their pages (mark reserved).
for_each_reserved_mem_region(i, &start, &end)
reserve_bootmem_region(start, end);

/*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
// --------------------------------------------- (2)
// Release every free range of memblock.memory to the buddy system.
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL)
count += __free_memory_core(start, end);

return count;
}

(1) 遍历memblock中所有reserved region,然后对每个region中的page进行一些初始化,其中主要有下面3个步骤
(1.a) init_reserved_page(start_pfn) - 这个函数这里为空
(1.b) INIT_LIST_HEAD(&page->lru) - 初始化页中的LRU
(1.c) __SetPageReserved(page) - 将页设置成reserved
(2) 这个循环遍历所有memblock中的memory区域,接下来我们重点看看这个函数,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// ./mm/memblock.c

// Convert a physical range [start, end) to a PFN range (clamped to
// max_low_pfn) and free it; returns the number of pages freed.
static unsigned long __init __free_memory_core(phys_addr_t start,
phys_addr_t end)
{
// Round the start up and the end down so only whole pages are freed.
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn);

if (start_pfn >= end_pfn)
return 0;
// ------------------------------------------ (1)
__free_pages_memory(start_pfn, end_pfn);

return end_pfn - start_pfn;
}

(1) 我们直接看这个函数,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// ./mm/memblock.c

// Free the PFN range [start, end) to the buddy allocator in the largest
// naturally-aligned power-of-two chunks that fit.
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
int order;

while (start < end) {
// Largest order allowed by both MAX_ORDER and the alignment of
// 'start' (__ffs() gives the index of the lowest set bit).
order = min(MAX_ORDER - 1UL, __ffs(start));

// Shrink the order until the chunk fits inside the range.
while (start + (1UL << order) > end)
order--;

memblock_free_pages(pfn_to_page(start), start, order);

start += (1UL << order);
}
}

简单来说,这个函数最后会将memblock.memory中的内存空间释放到伙伴系统。关于伙伴系统我就不在这篇笔记进行分析了,接下来我会开几篇笔记讲解伙伴系统。我们重新回到mm_init()函数,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// ./init/main.c

/*
* Set up kernel memory allocators
*/
static void __init mm_init(void)
{
/*
* page_ext requires contiguous pages,
* bigger than MAX_ORDER unless SPARSEMEM.
*/
// Empty under the sparsemem memory model.
page_ext_init_flatmem();
// Key function: hands memory over from memblock to the buddy allocator.
mem_init();
kmem_cache_init();
pgtable_init();
debug_objects_mem_init();
vmalloc_init();
ioremap_huge_init();
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
pti_init();
}

在这里剩下的函数中,有几个函数我不太清楚是干什么的,但kmem_cache_init()和vmalloc_init()很重要,都是之后会详细介绍的,这里就不逐个分析了。至此,可以认为内存管理器已经从memblock切换到了伙伴系统。这里有个细节需要注意,伙伴系统只管理memblock.memory中的内存空间,memblock.reserved中的内存空间依然处于reserved状态,并且不会被伙伴系统使用。

参考资料

  1. 内存管理(一)node & zone
  2. memblock 内存分配器原理和代码分析
  3. Linux内存管理,mmblock怎么把内存给到伙伴系统的?
CATALOG
  1. 1. build_all_zonelists()
  2. 2. mm_init()
    1. 2.1. mem_init()
      1. 2.1.1. free_unused_memmap()
      2. 2.1.2. memblock_free_all()
        1. 2.1.2.1. free_low_memory_core_early()
  3. 3. 参考资料