从零到负一

FIXMAP和相关页表的创建

2023/01/15

接上篇笔记,在上一篇笔记中,我们完成了最初的几个页表的创建,接下来就要开始start_kernel的工作了。到目前为止,内核还不知道硬件有多少内存空间,也不知道其它硬件的信息。Linux内核通过DTS文件来识别硬件并获取相关信息。这里有个重要的信息就是内存的大小。

下面这个例子来自LoyenWang的博客

1
2
3
4
5
// arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
/* Memory node: tells the kernel where one bank of system RAM lives. */
memory@80000000 {
device_type = "memory";
/*
 * reg = <address size>. Presumably two 32-bit cells each (#address-cells and
 * #size-cells are not shown in this excerpt -- confirm in the parent node),
 * i.e. base 0x0_80000000 and length 0x0_80000000.
 */
reg = <0x00000000 0x80000000 0 0x80000000>; /* DRAM space - 1, size : 2 GB DRAM */
};

有了这些信息,内核就知道内存的大小并且可以开始管理内存了。但这里还有一个问题,这些信息在启动时会被复制到内存空间,因此我们知道其物理地址。但这部分信息并不属于内核,因此我们还没有对其进行VA->PA的转换。没有这个转换,我们就不能访问DTS的信息。因此,我们首先要做的就是建立这个映射。当然,除了DTS,Linux内核还会对一些其它的地址进行映射。因此,内核专门搞了一个FIXMAP来完成这些空间的映射。

初看FIXMAP

我们首先需要看看这个FIXMAP到底是怎样分配的。在虚拟地址空间中,FIXMAP按照下面这个枚举的顺序排列,FIX_HOLE在最上面(地址最高),FIX_PGD在最下面(地址最低),它的最小单位是PAGE_SIZE。从下面注释中可以看出,除了FDT,其它部分基本都只占一个或者几个页。要访问某个部分也很简单,直接用FIXADDR_TOP - idx * PAGE_SIZE即可,类似于数组的模式。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
// ./arch/arm64/include/asm/fixmap.h

/*
* Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at
* compile time, but to set the physical address only
* in the boot process.
*
* These 'compile-time allocated' memory buffers are
* page-sized. Use set_fixmap(idx,phys) to associate
* physical memory with fixmap indices.
*
*/

/*
* Annotation: enumerator values for the configuration discussed in this article.
* Assumes 4 KB pages, MAX_FDT_SIZE = 2 MB (so FIX_FDT_SIZE = 4 MB = 1024 pages)
* and both CONFIG_ACPI_APEI_GHES and CONFIG_UNMAP_KERNEL_AT_EL0 enabled
* -- TODO confirm against the actual .config.
*
* FIX_HOLE = 0
* FIX_FDT_END = 1
* FIX_FDT = FIX_FDT_END + 4MB / 4KB - 1 = 1024
* FIX_EARLYCON_MEM_BASE = 1025
* FIX_TEXT_POKE0 = 1026
* FIX_APEI_GHES_IRQ = 1027
* FIX_APEI_GHES_NMI = 1028
* FIX_ENTRY_TRAMP_DATA = 1029
* FIX_ENTRY_TRAMP_TEXT = 1030
* __end_of_permanent_fixed_addresses = 1031 <----------------- FIXADDR_SIZE = 1031 * 4KB
* FIX_BTMAP_END = 1031
* FIX_BTMAP_BEGIN = 1031 + TOTAL_FIX_BTMAPS - 1 (TOTAL_FIX_BTMAPS = 256KB / 4KB * 7 = 448, so 1478)
* FIX_PTE = FIX_BTMAP_BEGIN + 1
* FIX_PMD = FIX_PTE + 1
* FIX_PUD = FIX_PMD + 1
* FIX_PGD = FIX_PUD + 1
*
* Because FIX_FDT is a multiple of 512 pages (2 MB), the virtual address of
* FIX_FDT stays 2 MB aligned as long as FIXADDR_TOP is.
*/
enum fixed_addresses {
FIX_HOLE,

/*
* Reserve a virtual window for the FDT that is 2 MB larger than the
* maximum supported size, and put it at the top of the fixmap region.
* The additional space ensures that any FDT that does not exceed
* MAX_FDT_SIZE can be mapped regardless of whether it crosses any
* 2 MB alignment boundaries.
*
* Keep this at the top so it remains 2 MB aligned.
*/
#define FIX_FDT_SIZE (MAX_FDT_SIZE + SZ_2M)
FIX_FDT_END,
FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,

FIX_EARLYCON_MEM_BASE,
FIX_TEXT_POKE0,

#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
FIX_APEI_GHES_IRQ,
FIX_APEI_GHES_NMI,
#endif /* CONFIG_ACPI_APEI_GHES */

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
FIX_ENTRY_TRAMP_DATA,
FIX_ENTRY_TRAMP_TEXT,
#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT))
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
__end_of_permanent_fixed_addresses,

/*
* Temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*/
#define NR_FIX_BTMAPS (SZ_256K / PAGE_SIZE)
#define FIX_BTMAPS_SLOTS 7
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)

FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,

/*
* Used for kernel page table creation, so unmapped memory may be used
* for tables.
*/
FIX_PTE,
FIX_PMD,
FIX_PUD,
FIX_PGD,

__end_of_fixed_addresses
};

/*
* FIXADDR_SIZE only covers the permanent slots: the boot-time ioremap slots and
* FIX_PTE..FIX_PGD have larger indices and therefore sit below FIXADDR_START.
*/
#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)

FDT映射的建立

FDT也就是Flattened Device Tree,它存放着我们需要的内存以及其它硬件的信息。通过它的初始化,我们可以大致明白FIXMAP以及其映射是如何实现、工作的。

early_fixmap_init()

这个函数要引入bm_pxd页表,这几个页表是静态生成的,它们将用于FIXMAP的映射。通过这个函数,我们将建立FIXADDR_START的映射,我们来看看它是如何实现的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// ./arch/arm64/mm/mmu.c
/*
* The p*d_populate functions call virt_to_phys implicitly so they can't be used
* directly on kernel symbols (bm_p*d). This function is called too early to use
* lm_alias so __p*d_populate functions must be used to populate with the
* physical address from __pa_symbol.
*/
// Annotation summary: hooks the static tables bm_pud, bm_pmd and bm_pte into the
// kernel page tables for the virtual address FIXADDR_START (PGD -> bm_pud ->
// bm_pmd -> bm_pte), then checks that the boot-time ioremap slots
// (FIX_BTMAP_BEGIN..FIX_BTMAP_END) are covered by that same PMD entry.
// No PTE is written here; individual fixmap slots are filled in later.
// The hex values in trailing comments were observed by the author under QEMU.
void __init early_fixmap_init(void)
{
pgd_t *pgdp, pgd;
pud_t *pudp;
pmd_t *pmdp;
// ----------------------------------------------------------------------------------------------------- (1)
// Layout of the virtual address range around the fixmap:
//
// #define VMEMMAP_START (PAGE_OFFSET - VMEMMAP_SIZE)
// #define PCI_IO_END (VMEMMAP_START - SZ_2M)
// #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
// #define FIXADDR_TOP (PCI_IO_START - SZ_2M)
//
// hi_addr
// | - PCI_IO_END
// | |
// | |
// | |
// | |- PCI_IO_START
// | ||
// | SZ_2M||
// | ||
// | |- FIXADDR_TOP -|
// | | ||
// | | ||
// | | ||
// | | ||
// | | ||4MB < FIXADDR_SIZE < 6MB
// | | ||
// | | ||
// | | ||
// | | ||
// \|/ - FIXADDR_START -|
// lo_addr |
// | ...
// |
// - FIX_PTE
// |
// - FIX_PMD
// |
// - FIX_PUD
// |
// - FIX_PGD
// |
// | ...
//
// #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
// #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
//
// FIXADDR_START and FIX_PGD must fall under the same PMD entry; otherwise a
// single PTE table (bm_pte) could not map both of them.
//
unsigned long addr = FIXADDR_START; // 0XFFFF7DFFFE7F9000

// ----------------------------------------------------------------------------------------------------- (2)
// Virtual address of the kernel PGD entry that covers addr, and its current value.
pgdp = pgd_offset_k(addr); // 0XFFFF00001223C7D8
pgd = READ_ONCE(*pgdp); // pgd.pgd = 0X421AA003

// ----------------------------------------------------------------------------------------------------- (3)
// Taken only with more than 3 table levels when the PGD entry is already
// populated with a table other than bm_pud: reuse that existing PUD table.
// (Original note: "this is passed in live run" -- presumably the branch was
// skipped in the observed QEMU run; TODO confirm.)
if (CONFIG_PGTABLE_LEVELS > 3 &&
!(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
/*
* We only end up here if the kernel mapping and the fixmap
* share the top level pgd entry, which should only happen on
* 16k/4 levels configurations.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
pudp = pud_offset_kimg(pgdp, addr);
} else {
// ------------------------------------------------------------------------------------------------- (4)
// Empty PGD entry: point it at bm_pud first.
if (pgd_none(pgd))
__pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);

// ------------------------------------------------------------------------------------------------- (5)
// fixmap_pxd(), as described by the author (helper bodies are not shown here):
// 1) reads the PGD/PUD/PMD entry to get the physical base of the next-level table
// 2) indexes that table with addr to get the physical address of the entry
// 3) adds an offset to turn it into the entry's virtual address
pudp = fixmap_pud(addr); // 0XFFFF0000121AAFF8
// pudp.pud = 0X421A9003
}

// ----------------------------------------------------------------------------------------------------- (6)
// Hook bm_pmd into the PUD entry (only if it is still empty), then
// unconditionally hook bm_pte into the PMD entry.
if (pud_none(READ_ONCE(*pudp)))
__pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
pmdp = fixmap_pmd(addr); // 0XFFFF0000121A9F98
// pmdp.pmd = 0X421A8003
__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);

/*
* The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
!= (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));

// Run-time version of the same check: both ends of the boot-ioremap range
// must resolve to the PMD entry that was just populated; otherwise warn and
// dump the addresses involved.
if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
|| pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
pr_warn("pmdp %p != %p, %p\n",
pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
fix_to_virt(FIX_BTMAP_BEGIN));
pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n",
fix_to_virt(FIX_BTMAP_END));

pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN);
}
}

我们先来看看通过QEMU模拟我获得的一些地址信息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
FIXADDR_START        = 0XFFFF7DFFFE7F9000 
// -----------------------------------------------
PGD Entry VA = 0XFFFF00001223C7D8
PGD Entry Value (PA) = 0X421AA003
// -----------------------------------------------
PUD Entry VA = 0XFFFF0000121AAFF8
PUD Entry Value (PA) = 0X421A9003
// -----------------------------------------------
PMD Entry VA = 0XFFFF0000121A9F98
PMD Entry Value (PA) = 0X421A8003
// -----------------------------------------------
init_mm.pgd VA = 0XFFFF00001223C000
init_mm.pgd[0] = 0X4223D003
init_mm.pgd[X] = 0X421AA003 // 这个就是上面的PGD Entry,可以看出,FIXMAP的映射和内核映射用了不同的PGD entry

还有一点很有意思,不知道QEMU是如何操作的,可能在运行第一次后,这些页表就被保存了。因为当这个函数还没有被调用的时候,通过GDB已经可以看到这些页表的数据(当时还在想是不是QEMU重启时没有将内存清空,但通过观察其它一些全局变量,打消了这个想法)。

上面的(1)到(6)已经写得很清楚了,当这个函数结束时,PUD、PMD、PTE三级页表都已经挂接完毕:PUD和PMD页表各填充了一个entry,而PTE页表(bm_pte)只是被挂到了PMD entry上,其中的entry还没有填充。因此,只要虚拟地址落在同一个PMD entry的范围内,就可以直接使用这整个PTE页表了。
这些页表就是静态生成的bm_pxd页表,我们现在来看看:

1
2
3
4
// Statically allocated, page-aligned tables used to map the fixmap region.
// Each has PTRS_PER_Pxx entries -- 512 according to the author (presumably the
// 4 KB page configuration; confirm for other page sizes).
static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;

这些页表有一个细节在下一个函数中会涉及 - FIXADDR_START ~ FIX_PGD都映射到同一个PMD entry,但FIX_FDT_END却不是。更准确地说,按虚拟地址看,[FIX_PGD, FIX_FDT)(左闭右开)这段地址都映射到同一个PMD entry,而FIX_FDT对应的虚拟地址恰好是临界地址(下一个PMD entry的起始地址)。

setup_machine_fdt()

这个函数主要是处理DT相关的内容,对于内存管理,它会获取内存信息然后使用memblock来进行管理。这里我主要关心的是内存管理相关内容,因此只关注fixmap_remap_fdt()和early_init_dt_scan_memory()。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Map the device tree blob found at physical address dt_phys and run the early
 * device-tree scan on it.
 *
 * If the blob cannot be mapped (fixmap_remap_fdt() returns NULL) or
 * early_init_dt_scan() rejects it, a critical message is printed and the CPU
 * spins forever -- this function does not return in that case.
 * On success the machine model name, if present, is logged and recorded as the
 * architecture description used in stack dumps.
 */
static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
void *dt_virt = fixmap_remap_fdt(dt_phys);
const char *name;

if (!dt_virt || !early_init_dt_scan(dt_virt)) {
pr_crit("\n"
"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
"\nPlease check your bootloader.",
&dt_phys, dt_virt);

/* Nothing sensible can be done without a device tree: park the CPU here. */
while (true)
cpu_relax();
}

name = of_flat_dt_get_machine_name();
if (!name)
return;

pr_info("Machine model: %s\n", name);
dump_stack_set_arch_desc("%s (DT)", name);
}

fixmap_remap_fdt()

这个函数就是将dt_phys进行映射,它通过__fixmap_remap_fdt()完成映射,并将该区域放入memblock中的reserve区域。

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Map the FDT at dt_phys read-only (PAGE_KERNEL_RO) through the fixmap and
 * reserve its physical range in memblock.
 *
 * Returns the virtual address of the blob, or NULL if __fixmap_remap_fdt()
 * failed; nothing is reserved in the failure case.
 */
void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
{
void *dt_virt;
int size;

dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
if (!dt_virt)
return NULL;

/* size was filled in from the FDT header (totalsize) by the call above. */
memblock_reserve(dt_phys, size);
return dt_virt;
}

__fixmap_remap_fdt()

这个函数最重要的就是(1)处调用的函数,它是实际进行映射的函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * Map the FDT located at dt_phys into the FIX_FDT fixmap window with protection
 * prot, and store the blob's total size in *size.
 *
 * Returns the virtual address of the FDT header, or NULL when dt_phys is zero
 * or misaligned, the magic number is wrong, or the blob is larger than
 * MAX_FDT_SIZE. Note that *size is only written once the magic check passed.
 */
void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
{
const u64 dt_virt_base = __fix_to_virt(FIX_FDT); // dt_virt_base = 0XFFFF7DFFFE800000
int offset;
void *dt_virt;

/*
* Check whether the physical FDT address is set and meets the minimum
* alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
* at least 8 bytes so that we can always access the magic and size
* fields of the FDT header after mapping the first chunk, double check
* here if that is indeed the case.
*/
BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
if (!dt_phys || dt_phys % MIN_FDT_ALIGN) // dt_phys = 0x48000000
return NULL;

/*
* Make sure that the FDT region can be mapped without the need to
* allocate additional translation table pages, so that it is safe
* to call create_mapping_noalloc() this early.
*
* On 64k pages, the FDT will be mapped using PTEs, so we need to
* be in the same PMD as the rest of the fixmap.
* On 4k pages, we'll use section mappings for the FDT so we only
* have to be in the same PUD.
*/
BUILD_BUG_ON(dt_virt_base % SZ_2M);

BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
__fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);

// The mapping below starts at dt_phys rounded down to a block boundary, so
// the blob itself begins `offset` bytes into the mapped window.
offset = dt_phys % SWAPPER_BLOCK_SIZE; // SWAPPER_BLOCK_SIZE = 2MB
dt_virt = (void *)dt_virt_base + offset;

/* map the first chunk so we can read the size from the header */
// ----------------------------------------------------------------------------------------------------- (1)
// Map one block (2 MB here) first; if the blob turns out to extend past it,
// the range is mapped again below with the full size.
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
dt_virt_base, SWAPPER_BLOCK_SIZE, prot);

if (fdt_magic(dt_virt) != FDT_MAGIC)
return NULL;

*size = fdt_totalsize(dt_virt);
if (*size > MAX_FDT_SIZE)
return NULL;

// Blob crosses the end of the first block: redo the mapping from the same
// base, now covering offset + size rounded up to whole blocks.
if (offset + *size > SWAPPER_BLOCK_SIZE)
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);

return dt_virt;
}

create_mapping_noalloc()

我单独把这个函数提出来,因为这个函数会一层一层地调用很多函数。但不管怎么说,这个函数功能很简单,就是在现有页表的基础上进行VA->PA的映射(因此叫noalloc)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
* This function can only be used to modify existing table entries,
* without allocating new levels of table. Note that this permits the
* creation of new section or page entries.
*
* Maps size bytes of physical memory at phys to virtual address virt with
* protection prot in init_mm's page tables. Requests below VMALLOC_START are
* refused with a warning and nothing is mapped.
*/
static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
phys_addr_t size, pgprot_t prot)
{
if (virt < VMALLOC_START) {
pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
&phys, virt);
return;
}
/*
* NULL allocator: alloc_init_pud() and alloc_init_cont_pmd() BUG_ON(!pgtable_alloc)
* when they meet an empty entry, so every intermediate table must already exist.
*/
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, NO_CONT_MAPPINGS);
}

__create_pgd_mapping()

这是第一个被调用的函数,它会利用现有的PGD来映射PUD

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Map [virt, virt + size) to phys in the page table rooted at pgdir, walking
 * one PGD entry at a time and delegating each piece to alloc_init_pud().
 *
 * pgtable_alloc supplies physical pages for missing lower-level tables and may
 * be NULL when the caller guarantees they already exist; flags is passed
 * through unchanged to the lower levels.
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(void),
int flags)
{
unsigned long addr, length, end, next;
pgd_t *pgdp = pgd_offset_raw(pgdir, virt);

/*
* If the virtual and physical address don't have the same offset
* within a page, we cannot map the region as the caller expects.
*/
if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
return;

/* Work on whole pages: align both addresses down and grow the length to match. */
phys &= PAGE_MASK;
addr = virt & PAGE_MASK;
length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));

end = addr + length;
do {
// ------------------------------------------------------------------------------------------------- (1)
// next = the lower of `end` and the first address covered by the following
// PGD entry. For the FDT mapping traced in this article, `end` is far lower,
// so the loop body runs once.
next = pgd_addr_end(addr, end);
alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, flags);
phys += next - addr;
} while (pgdp++, addr = next, addr != end);
}

(1) 这里的next是end和下一个PGD entry映射的虚拟地址中较小的一个,这里显然取end。

alloc_init_pud()

这个函数用于建立PUD的映射,同PGD相同,我们这里只需要一级PUD足以。这个函数最难理解的部分在(1)处,想了很久,不知道为什么一定要这样。下面我会解释这个函数(宏)是如何工作的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/*
 * Fill the PUD entries covering [addr, end) under the PGD entry *pgdp.
 *
 * An empty PGD entry gets a new PUD table from pgtable_alloc (BUG if the
 * allocator is NULL). Each PUD entry then either becomes a 1 GB block mapping,
 * when use_1G_block() allows it and NO_BLOCK_MAPPINGS is not set, or is handed
 * down to alloc_init_cont_pmd().
 */
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(void),
int flags)
{
unsigned long next;
pud_t *pudp;
pgd_t pgd = READ_ONCE(*pgdp);

if (pgd_none(pgd)) {
phys_addr_t pud_phys;
BUG_ON(!pgtable_alloc);
pud_phys = pgtable_alloc();
__pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
pgd = READ_ONCE(*pgdp);
}
BUG_ON(pgd_bad(pgd));

// ----------------------------------------------------------------------------------------------------- (1)
// pud_set_fixmap_offset() maps the fixmap slot FIX_PUD onto the page that holds
// the PUD entry for addr and returns the entry's virtual address:
// physical address of the entry: table address read from *pgdp
// + pud_index(addr) * sizeof(pud_t)
// virtual address it is seen at: the FIX_PUD slot (plus the in-page offset)
// In the FDT case traced in this article that table is bm_pud.
//
// FIX_PUD falls under the same bm_pmd entry as FIXADDR_START (otherwise one more
// bm_pmd entry would be needed), so the PTE for FIX_PUD lives in bm_pte. Once the
// slot is mapped, the PUD table is accessed through FIX_PUD rather than by
// converting pgd -> physical address of the PUD entry -> virtual address.
pudp = pud_set_fixmap_offset(pgdp, addr);
do {
pud_t old_pud = READ_ONCE(*pudp);

next = pud_addr_end(addr, end);

/*
* For 4K granule only, attempt to put down a 1GB block
*/
if (use_1G_block(addr, next, phys) &&
(flags & NO_BLOCK_MAPPINGS) == 0) {
pud_set_huge(pudp, phys, prot);

/*
* After the PUD entry has been populated once, we
* only allow updates to the permission attributes.
*/
BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
READ_ONCE(pud_val(*pudp))));
} else {
alloc_init_cont_pmd(pudp, addr, next, phys, prot,
pgtable_alloc, flags);

/* A PUD entry that was already set must not have been changed by the call above. */
BUG_ON(pud_val(old_pud) != 0 &&
pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
}
phys += next - addr;
} while (pudp++, addr = next, addr != end);

/* Done with the temporary FIX_PUD window. */
pud_clear_fixmap();
}

(1)注释部分已经讲的比较清楚了,我们用FIX_PUD来映射addr对应的PUD,而这里的addr就是FIX_FDT对应的虚拟地址。它映射的PUD就是bm_pud,因此FIX_PUD刚好映射到bm_pud。之后,我们就可以直接用FIX_PUD +/- Offset来访问bm_pud的不同entry

之所以我这里想了很久,主要是没理清楚bm_pxd、FIX_PXD和FIX_FDT之间的关系,不明白为什么要用FIX_PUD来映射(完全可以获取PUD entry的物理地址,然后再通过OFFSET完成这个工作啊?)。不管这些细节了,在该函数之后的部分,都是通过FIX_PUD这个地址访问bm_pud。之后的FIX_PMD和bm_pmd也是这样的操作。

FIX_PUD映射相关宏/函数

这里简单记录下FIX_PUD是如何进行映射的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
// Call site in alloc_init_pud(); the macros below show what it expands to.
pudp = pud_set_fixmap_offset(pgdp, addr);

#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr))

// --------------------------------------------------------------------------------------------------------- (1)
// Compute the physical address of the PUD entry: table address read from the
// PGD entry plus the byte offset of the entry selected by addr.
#define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t))

// --------------------------------------------------------------------------------------------------------- (2)
// Map the FIX_PUD fixmap slot onto the physical address of that PUD entry.
#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr))
#define set_fixmap_offset(idx, phys)\
__set_fixmap_offset(idx, phys, FIXMAP_PAGE_NORMAL)
// ----------------------------------------------------------------------------------------------------- (3)
// (Annotations are kept above the macro: a `//` comment inside a
// backslash-continued body would end the macro at that line.)
// __set_fixmap() installs the mapping by writing the PTE for slot idx directly.
// That works only because FIX_PUD and FIXADDR_START fall under the same PMD
// entry, whose PTE table is already in place -- so this is not a general-purpose
// mapping helper.
// The macro evaluates to the virtual address of the PUD entry: fix_to_virt(idx)
// is the page-aligned slot address (now backed by the page of the PUD table,
// bm_pud in this walkthrough) and (phys & (PAGE_SIZE - 1)) re-adds the entry's
// offset within that page.
#define __set_fixmap_offset(idx, phys, flags)\
({\
unsigned long ________addr;\
__set_fixmap(idx, phys, flags);\
________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1));\
________addr;\
})

/*
* Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
* ever need to use IPIs for TLB broadcasting, then we're in trouble here.
*
* Point fixmap slot idx at physical address phys with protection flags, or
* tear the slot down when flags is zero. idx must lie strictly between
* FIX_HOLE and __end_of_fixed_addresses.
*/
void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *ptep;

BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);

ptep = fixmap_pte(addr);
// ----------------------------------------------------------------------------------------------------- (4)
// The PTE is written directly: non-zero flags install phys at this slot,
// zero flags clear the entry and flush the TLB for that one page.
if (pgprot_val(flags)) {
set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
} else {
pte_clear(&init_mm, addr, ptep);
flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
}
}

(1) pud_offset_phys宏计算出PUD entry的物理地址;
(2) 用一系列的宏/函数将FIX_PUD和PUD entry进行映射;
(3) __set_fixmap()函数最终将PTE进行更新,之后就可以通过FIX_PUD访问PUD entry了。

alloc_init_cont_pmd()和init_pmd()

有了上面的基础,下面这两个函数就很容易理解了。直接看(1)(2)即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/*
 * Fill the PMD level for [addr, end) under the PUD entry *pudp.
 *
 * An empty PUD entry gets a new PMD table from pgtable_alloc (BUG if the
 * allocator is NULL). The range is then processed in pmd_cont_addr_end()-sized
 * pieces, each handed to init_pmd(); a piece gets the PTE_CONT attribute when it
 * is suitably aligned and NO_CONT_MAPPINGS is not set in flags.
 */
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(void), int flags)
{
unsigned long next;
pud_t pud = READ_ONCE(*pudp);

/*
* Check for initial section mappings in the pgd/pud.
*/
BUG_ON(pud_sect(pud));
if (pud_none(pud)) {
phys_addr_t pmd_phys;
BUG_ON(!pgtable_alloc);
pmd_phys = pgtable_alloc();
__pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
pud = READ_ONCE(*pudp);
}
BUG_ON(pud_bad(pud));

do {
/* Per-piece copy, so PTE_CONT added for one piece does not leak into the next. */
pgprot_t __prot = prot;

next = pmd_cont_addr_end(addr, end);

/* use a contiguous mapping if the range is suitably aligned */
if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);

phys += next - addr;
} while (addr = next, addr != end);
}

/*
 * Fill the PMD entries covering [addr, end) under the PUD entry *pudp.
 *
 * Each entry becomes a section (block) mapping when addr, next and phys are all
 * section-aligned and NO_BLOCK_MAPPINGS is not set; otherwise the range is
 * handed down to alloc_init_cont_pte().
 */
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(void), int flags)
{
unsigned long next;
pmd_t *pmdp;

// ----------------------------------------------------------------------------------------------------- (1)
// Same technique as for the PUD level: map the PMD table through the FIX_PMD
// fixmap slot and get the virtual address of the entry for addr.
pmdp = pmd_set_fixmap_offset(pudp, addr);
do {
pmd_t old_pmd = READ_ONCE(*pmdp);

next = pmd_addr_end(addr, end);

/* try section mapping first */
if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
(flags & NO_BLOCK_MAPPINGS) == 0) {
// --------------------------------------------------------------------------------------------- (2)
// Map the whole section in one entry (2 MB in the configuration
// discussed in this article).
pmd_set_huge(pmdp, phys, prot);

/*
* After the PMD entry has been populated once, we
* only allow updates to the permission attributes.
*/
BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
READ_ONCE(pmd_val(*pmdp))));
} else {
alloc_init_cont_pte(pmdp, addr, next, phys, prot,
pgtable_alloc, flags);

/* A PMD entry that was already set must not have been changed by the call above. */
BUG_ON(pmd_val(old_pmd) != 0 &&
pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
}
phys += next - addr;
} while (pmdp++, addr = next, addr != end);

/* Done with the temporary FIX_PMD window. */
pmd_clear_fixmap();
}

(1) 将FIX_PMD和bm_pmd进行映射;
(2) 这里我们直接按照section的方式映射2MB的内存空间。

setup_machine_fdt()的剩余部分

这部分主要就是扫描DTS并使用memblock来管理这些发现的内存空间,关于memblock我将在下一篇笔记中开始讲解。

CATALOG
  1. 1. 初看FIXMAP
  2. 2. FDT映射的建立
    1. 2.1. early_fixmap_init()
    2. 2.2. setup_machine_fdt()
      1. 2.2.1. fixmap_remap_fdt()
      2. 2.2.2. __fixmap_remap_fdt()
    3. 2.3. create_mapping_noalloc()
      1. 2.3.1. __create_pgd_mapping()
      2. 2.3.2. alloc_init_pud()
      3. 2.3.3. FIX_PUD映射相关宏/函数
      4. 2.3.4. alloc_init_cont_pmd()和init_pmd()
    4. 2.4. setup_machine_fdt()的剩余部分