从零到负一

【LM08】Node和Zone的初始化

2023/04/10

在之前的笔记【LM06】bootmem_init()和Linux的物理内存模型中,我已经分析了bootmem_init()中大部分的内容。剩下的内容就是对Node以及Zone等进行初始化。在上一篇笔记【LM07】重新回顾内存初始化中,我介绍了Node、Zone及其结构体,我们现在来看看如何对它们进行初始化。

zone_sizes_init()

我们继续回到bootmem_init()函数,这里我们直接看zone_sizes_init()函数。这是一个繁杂的函数,一层一层看下去比较头疼。没办法,硬着头皮上吧。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// ./arch/arm64/mm/init.c

// Late boot-time memory setup for arm64: derive the PFN range covered by
// memblock, run the optional early memory test, set up NUMA and sparsemem
// state, then size and initialize the zones via zone_sizes_init().
void __init bootmem_init(void)
{
unsigned long min, max;

// First PFN and one-past-last PFN of all memblock.memory (DRAM) regions.
min = PFN_UP(memblock_start_of_DRAM());
max = PFN_DOWN(memblock_end_of_DRAM());

early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

max_pfn = max_low_pfn = max;

arm64_numa_init();
/*
* Sparsemem tries to allocate bootmem in memory_present(), so must be
* done after the fixed reservations.
*/
arm64_memory_present();
sparse_init();
// Size the zones and initialize the node and its zones (analyzed below).
zone_sizes_init(min, max);

memblock_dump_all();
}

上面是bootmem_init()函数,我们可以看到它的倒数第二个函数就是我们要分析的zone_sizes_init(),下面直接上源码吧。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
// ./include/linux/mmzone.h

// Let us first look at the definition of zone_type: the set of zone indices
// a node's memory can be divided into. In the arm64 configuration analyzed
// here only ZONE_DMA32 and ZONE_NORMAL end up populated (see
// zone_sizes_init() below).
enum zone_type {
#ifdef CONFIG_ZONE_DMA
/*
* ZONE_DMA is used when there are devices that are not able
* to do DMA to all of addressable memory (ZONE_NORMAL). Then we
* carve out the portion of memory that is needed for these devices.
* The range is arch specific.
*
* Some examples
*
* Architecture Limit
* ---------------------------
* parisc, ia64, sparc <4G
* s390, powerpc <2G
* arm Various
* alpha Unlimited or 0-16MB.
*
* i386, x86_64 and multiple other arches
* <16M.
*/
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
/*
* x86_64 needs two ZONE_DMAs because it supports devices that are
* only able to do DMA to the lower 16M but also 32 bit devices that
* can only do DMA areas below 4G.
*/
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};

// /arch/arm64/mm/init.c

// min = first PFN of all memblock.memory regions, max = one past the last PFN.
// Before calling free_area_init_node(), this function mainly walks
// memblock.memory to fill in zone_size[] and zhole_size[].
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
struct memblock_region *reg;
unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
// max_dma = first PFN of memblock.memory; stays equal to min (i.e. the
// DMA32 zone is empty) when CONFIG_ZONE_DMA32 is not set.
unsigned long max_dma = min;

memset(zone_size, 0, sizeof(zone_size));

/* 4GB maximum for 32-bit only capable devices */
#ifdef CONFIG_ZONE_DMA32
max_dma = PFN_DOWN(arm64_dma_phys_limit);
zone_size[ZONE_DMA32] = max_dma - min;
#endif
// ZONE_NORMAL takes all remaining memblock memory region space here.
zone_size[ZONE_NORMAL] = max - max_dma;

// Start with every spanned page counted as a hole, then subtract the
// real memory found in each memblock region below.
memcpy(zhole_size, zone_size, sizeof(zhole_size));

// -------------------------------------------------------------- (1)
for_each_memblock(memory, reg) {
unsigned long start = memblock_region_memory_base_pfn(reg);
unsigned long end = memblock_region_memory_end_pfn(reg);

if (start >= max)
continue;

#ifdef CONFIG_ZONE_DMA32
// The part of this region below the DMA32 limit is not a hole.
if (start < max_dma) {
unsigned long dma_end = min(end, max_dma);
zhole_size[ZONE_DMA32] -= dma_end - start;
}
#endif
// The part of this region above the DMA32 limit is not a hole either.
if (end > max_dma) {
unsigned long normal_end = min(end, max);
unsigned long normal_start = max(start, max_dma);
zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
}
}

// -------------------------------------------------------------- (2)
free_area_init_node(0, zone_size, min, zhole_size);
}

(1) 通过遍历memblock.memory中的region来确定zhole_size[ZONE_NORMAL]的大小;
(2) 初始化Node以及free_area,具体代码分析如下。

free_area_init_node()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// ./mm/page_alloc.c

// Initialize node @nid: record its id and start PFN, compute the per-zone
// spanned/present page counts, then initialize each zone of the node via
// free_area_init_core().
void __init free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn,
unsigned long *zholes_size)
{
// ------------------------------------------------------ (1)
// #define NODE_DATA(nid) (&contig_page_data)
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;

/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);

pgdat->node_id = nid; // nid = 0
pgdat->node_start_pfn = node_start_pfn; // node_start_pfn = min = first PFN of all memblock.memory
pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#else
start_pfn = node_start_pfn;
#endif
// ------------------------------------------------------ (2)
calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size);

// in sparsemem mode, this function is {}
alloc_node_mem_map(pgdat);
// this function is {}
pgdat_set_deferred_range(pgdat);

// ------------------------------------------------------ (3)
free_area_init_core(pgdat);
}

(1) 在UMA架构下只有一个Node,该Node是静态分配的contig_page_data
(2) 计算Node中所有的(spanned)以及实际存在的(present)物理页;
(3) 初始化Node的核心函数。
下面分别看看(2)和(3)中牵涉的函数。

calculate_node_totalpages()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// For every zone of @pgdat, compute the spanned page count (all PFNs between
// the zone's first and last page, holes included) and the present page count
// (pages actually backed by memory), then accumulate the node-wide totals.
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zones_size,
unsigned long *zholes_size)
{
unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;

for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;

// --------------------------------------------------------------- (1)
size = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn, zones_size);

// --------------------------------------------------------------- (2)
// present = spanned minus the pages that fall into holes.
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size);
if (size)
zone->zone_start_pfn = zone_start_pfn;
else
zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;

totalpages += size;
realtotalpages += real_size;
}

// --------------------------------------------------------------- (3)
pgdat->node_spanned_pages = totalpages;
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}

(1) spanned_pages指的是所有的物理页,zone_spanned_pages_in_node()函数计算Zone中所有的物理页;
(2) 用所有物理页减去Zone中处于hole中的物理页得到实际存在的物理页;
(3) 更新Node中和页数量相关的成员变量。

free_area_init_core()

这个函数会调用大量的其它函数来对Zone, free_area进行初始化,下面具体看看做了哪些初始化。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*
* NOTE: pgdat should get zeroed by caller.
* NOTE: this function is only called during early init.
*/
static void __init free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;

// ------------------------------------------------------------ (1)
pgdat_init_internals(pgdat);
pgdat->per_cpu_nodestats = &boot_nodestats;

// Walk every zone and initialize it (in this configuration only the
// DMA32 and NORMAL zones are actually populated).
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;

size = zone->spanned_pages;
freesize = zone->present_pages;

/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
// If too many pages are absent (size > 1.25 * freesize), the memmap
// size is computed from the present pages instead.
memmap_pages = calc_memmap_size(size, freesize);

// ------------------------------------------------------------ (2)
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}

/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}

if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;

// ------------------------------------------------------------ (3)
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone_init_internals(zone, j, nid, freesize);

if (!size)
continue;

// Updates pageblock_order if CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is set;
// otherwise this is an empty function.
set_pageblock_order();
// ------------------------------------------------------------ (4)
setup_usemap(pgdat, zone, zone_start_pfn, size);
// ------------------------------------------------------------ (5)
init_currently_empty_zone(zone, zone_start_pfn, size);
// ------------------------------------------------------------ (6)
memmap_init(size, nid, j, zone_start_pfn);
}
}

这里对大部分函数我就不进一步展开了,如果需要可以直接去看源码。
(1) 这里主要对Node做一些lock, list等的初始化;
(2) 从(2)到(3)之间主要是计算Zone中各种页的数量,这些数据会存储在Zone中或者相应的全局变量中;
(3) 这个函数对Zone进行部分初始化;
(4) 主要是根据usemap分配页空间给zone->pageblock_flags,关于usemap目前我还不太了解作用,之后弄明白了再过来更新;
(5)和(6)我这里简单分析下,

init_currently_empty_zone()

这个函数主要是初始化free_area,这个在之后的伙伴系统中会被使用。这里的初始化仅仅指的是将free_area中的free_list头节点初始化。 free_list中并未添加任何节点。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// Record the zone's start PFN, raise the node's nr_zones if needed, and set
// up the zone's (still empty) free_area lists used later by the buddy system.
void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int zone_idx = zone_idx(zone) + 1;

// nr_zones tracks the highest initialized zone index + 1 on this node.
if (zone_idx > pgdat->nr_zones)
pgdat->nr_zones = zone_idx;

zone->zone_start_pfn = zone_start_pfn;

mminit_dprintk(MMINIT_TRACE, "memmap_init",
"Initialising map node %d zone %lu pfns %lu -> %lu\n",
pgdat->node_id,
(unsigned long)zone_idx(zone),
zone_start_pfn, (zone_start_pfn + size));

zone_init_free_lists(zone);
zone->initialized = 1;
}

// Initialize every per-order, per-migratetype free list head of @zone to an
// empty list; no pages are added to the lists here.
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}

memmap_init()

这个函数对Zone中的页进行初始化。这里就和前面的sparsemem联系在一起了,通过pfn_to_page()我们找到任何有效的page。找到page后,调用 __init_single_page()函数来对page进行简单的初始化。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
// Weak default memmap initializer: initialize the struct pages of this zone
// as boot-time (MEMMAP_EARLY) memory with no altmap.
void __meminit __weak memmap_init(unsigned long size, int nid,
unsigned long zone, unsigned long start_pfn)
{
memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
}

/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;

if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;

#ifdef CONFIG_ZONE_DEVICE
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
* memory. We limit the total number of pages to initialize to just
* those that might contain the memory mapping. We will defer the
* ZONE_DEVICE page initialization until after we have released
* the hotplug lock.
*/
if (zone == ZONE_DEVICE) {
if (!altmap)
return;

if (start_pfn == altmap->base_pfn)
start_pfn += altmap->reserve;
end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
}
#endif

for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
break;
}

// pfn_to_page() relies on the sparsemem setup done earlier; with a
// valid pfn in hand, do the basic per-page initialization.
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
__SetPageReserved(page);

/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made.
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
}
}

至此,Node, Zone等都完成了初始化。这里面的内容很多,但并不需要全都搞懂。之后在进行伙伴系统以及slab的学习过程中还需要不断地回顾、理解,到时候或许又有新的理解了。

参考资料

  1. 【原创】(五)Linux内存管理zone_sizes_init
CATALOG
  1. 1. zone_sizes_init()
  2. 2. free_area_init_node()
    1. 2.1. calculate_node_totalpages()
    2. 2.2. free_area_init_core()
      1. 2.2.1. init_currently_empty_zone()
      2. 2.2.2. memmap_init()
  3. 3. 参考资料