FIXMAP和相关页表的创建

硬核技术 Linux内核 Linux_5_0 ARMv8_64 Linux内存管理

 2023/01/15 

接上篇笔记，在上一篇笔记中，我们完成了最初的几个页表的创建，接下来就要开始start_kernel的工作了。到目前为止，内核还不知道硬件有多少内存空间，也不知道其它硬件的信息。Linux内核通过DTS文件来识别硬件并获取相关信息。这里有个重要的信息就是内存的大小。

下面这个例子来自LoyenWang的博客

// arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
memory@80000000 {
                 device_type = "memory"; 
                 reg = <0x00000000 0x80000000 0 0x80000000>; /* DRAM space - 1, size : 2 GB DRAM */ 
                };

有了这些信息，内核就知道内存的大小并且可以开始管理内存了。但这里还有一个问题，这些信息在启动时会被复制到内存空间，因此我们知道其物理地址。但这部分信息并不属于内核，因此我们还没有对其进行VA->PA的转换。没有这个转换，我们就不能访问DTS的信息。因此，我们首先要做的就是建立这个映射。当然，除了DTS，Linux内核还会对一些其它的地址进行映射。因此，内核专门搞了一个FIXMAP来完成这些空间的映射。

初看FIXMAP

我们首先需要看看这个FIXMAP到底是怎样分配的。在虚拟地址空间中，FIXMAP按照下面这个枚举的顺序排列，FIX_HOLE在最上面（地址最高），FIX_PGD在最下面（地址最低），它的最小单位是PAGE_SIZE。从下面注释中可以看出，除了FDT，其它部分基本都只占一个或者几个页。要得到某个部分的虚拟地址也很简单，直接用FIXADDR_TOP - idx * PAGE_SIZE即可，类似于数组的模式。

// ./arch/arm64/include/asm/fixmap.h

/*
 * Here we define all the compile-time 'special' virtual
 * addresses. The point is to have a constant address at
 * compile time, but to set the physical address only
 * in the boot process.
 *
 * These 'compile-time allocated' memory buffers are
 * page-sized. Use set_fixmap(idx,phys) to associate
 * physical memory with fixmap indices.
 *
 */

/*
 * Index values produced by the enum below, assuming 4 KB pages,
 * MAX_FDT_SIZE = 2 MB (so FIX_FDT_SIZE = 4 MB = 1024 pages) and both
 * CONFIG_ACPI_APEI_GHES and CONFIG_UNMAP_KERNEL_AT_EL0 enabled:
 *
 *   FIX_HOLE                    = 0
 *   FIX_FDT_END                 = 1
 *   FIX_FDT = 1 + 4MB / 4KB - 1 = 1024
 *   FIX_EARLYCON_MEM_BASE       = 1025
 *   FIX_TEXT_POKE0              = 1026
 *   FIX_APEI_GHES_IRQ           = 1027
 *   FIX_APEI_GHES_NMI           = 1028
 *   FIX_ENTRY_TRAMP_DATA        = 1029
 *   FIX_ENTRY_TRAMP_TEXT        = 1030
 *   __end_of_permanent_fixed_addresses = 1031   <-- FIXADDR_SIZE = 1031 * 4KB
 *   FIX_BTMAP_END               = 1031
 *   FIX_BTMAP_BEGIN             = 1031 + TOTAL_FIX_BTMAPS - 1 = 1478
 *   FIX_PTE                     = FIX_BTMAP_BEGIN + 1         = 1479
 *   FIX_PMD                     = FIX_PTE + 1                 = 1480
 *   FIX_PUD                     = FIX_PMD + 1                 = 1481
 *   FIX_PGD                     = FIX_PUD + 1                 = 1482
 *
 * (TOTAL_FIX_BTMAPS = (256KB / 4KB) * 7 = 448.)
 */
enum fixed_addresses {
    /* Index 0 is never mapped; __set_fixmap() rejects idx <= FIX_HOLE. */
    FIX_HOLE,

    /*
     * Reserve a virtual window for the FDT that is 2 MB larger than the
     * maximum supported size, and put it at the top of the fixmap region.
     * The additional space ensures that any FDT that does not exceed
     * MAX_FDT_SIZE can be mapped regardless of whether it crosses any
     * 2 MB alignment boundaries.
     *
     * Keep this at the top so it remains 2 MB aligned.
     */
#define FIX_FDT_SIZE (MAX_FDT_SIZE + SZ_2M)
    FIX_FDT_END,
    FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,

    FIX_EARLYCON_MEM_BASE,
    FIX_TEXT_POKE0,

#ifdef CONFIG_ACPI_APEI_GHES
    /* Used for GHES mapping from assorted contexts */
    FIX_APEI_GHES_IRQ,
    FIX_APEI_GHES_NMI,
#endif /* CONFIG_ACPI_APEI_GHES */

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
    FIX_ENTRY_TRAMP_DATA,
    FIX_ENTRY_TRAMP_TEXT,
#define TRAMP_VALIAS        (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT))
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
    /* Everything above this marker is counted in FIXADDR_SIZE. */
    __end_of_permanent_fixed_addresses,

    /*
     * Temporary boot-time mappings, used by early_ioremap(),
     * before ioremap() is functional.
     */
#define NR_FIX_BTMAPS        (SZ_256K / PAGE_SIZE)
#define FIX_BTMAPS_SLOTS    7
#define TOTAL_FIX_BTMAPS    (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)

    FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
    FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,

    /*
     * Used for kernel page table creation, so unmapped memory may be used
     * for tables.
     */
    FIX_PTE,
    FIX_PMD,
    FIX_PUD,
    FIX_PGD,

    __end_of_fixed_addresses
};

/*
 * Size and lowest virtual address of the permanent part of the fixmap.
 * Only indices below __end_of_permanent_fixed_addresses are counted, so the
 * FIX_BTMAP_* and FIX_PTE..FIX_PGD slots are not included in FIXADDR_SIZE.
 */
#define FIXADDR_SIZE    (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)

FDT映射的建立

FDT也就是Flattened Device Tree，它存放着我们需要的内存以及其它硬件的信息。通过它的初始化，我们可以大致明白FIXMAP以及其映射是如何实现、工作的。

early_fixmap_init()

这个函数要引入bm_pxd页表，这几个页表是静态生成的，它们将用于FIXMAP的映射。通过这个函数，我们将建立FIXADDR_START的映射，我们来看看它是如何实现的。

// ./arch/arm64/mm/mmu.c
/*
 * Hook the statically allocated bm_pud/bm_pmd/bm_pte tables into the
 * page-table path that translates FIXADDR_START, then check that the
 * boot-ioremap slots (FIX_BTMAP_*) are covered by that same PMD entry.
 *
 * The p*d_populate functions call virt_to_phys implicitly so they can't be used
 * directly on kernel symbols (bm_p*d). This function is called too early to use
 * lm_alias so __p*d_populate functions must be used to populate with the
 * physical address from __pa_symbol.
 */
void __init early_fixmap_init(void)
{
    pgd_t *pgdp, pgd;
    pud_t *pudp;
    pmd_t *pmdp;
    // ----------------------------------------------------------------------------------------------------- (1)
    //    #define VMEMMAP_START  (PAGE_OFFSET - VMEMMAP_SIZE)
    //    #define PCI_IO_END     (VMEMMAP_START - SZ_2M)
    //    #define PCI_IO_START   (PCI_IO_END - PCI_IO_SIZE)
    //    #define FIXADDR_TOP     (PCI_IO_START - SZ_2M)
    //
    //   hi_addr
    //     |        - PCI_IO_END
    //     |        |
    //     |        |
    //     |        |
    //     |       |- PCI_IO_START
    //     |       ||
    //     |  SZ_2M||
    //     |       ||
    //     |       |- FIXADDR_TOP   -|
    //     |        |               ||
    //     |        |               ||
    //     |        |               ||
    //     |        |               ||
    //     |        |               ||4MB < FIXADDR_SIZE < 6MB
    //     |        |               ||
    //     |        |               ||
    //     |        |               ||
    //     |        |               ||
    //    \|/       - FIXADDR_START -|
    //   lo_addr    |
    //              | ...
    //              |
    //              - FIX_PTE
    //              |
    //              - FIX_PMD
    //              |
    //              - FIX_PUD
    //              |
    //              - FIX_PGD
    //              |
    //              | ...
    //
    //    #define FIXADDR_SIZE   (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
    //    #define FIXADDR_START  (FIXADDR_TOP - FIXADDR_SIZE)
    //
    //    FIXADDR_START and FIX_PGD must fall under the same PMD entry, otherwise
    //    a single PTE table (bm_pte) could not map both of them
    //
    unsigned long addr = FIXADDR_START; // 0XFFFF7DFFFE7F9000 (value observed under QEMU)

    // ----------------------------------------------------------------------------------------------------- (2)
    // Virtual address of the PGD entry that covers addr, and its current value
    pgdp = pgd_offset_k(addr); // 0XFFFF00001223C7D8
    pgd = READ_ONCE(*pgdp);    // pgd.pgd = 0X421AA003

    // ----------------------------------------------------------------------------------------------------- (3)
    // Taken only when the PGD entry is already populated with something other
    // than bm_pud; in the QEMU run described here this branch is NOT taken
    if (CONFIG_PGTABLE_LEVELS > 3 &&
        !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
        /*
         * We only end up here if the kernel mapping and the fixmap
         * share the top level pgd entry, which should only happen on
         * 16k/4 levels configurations.
         */
        BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
        pudp = pud_offset_kimg(pgdp, addr);
    } else {
        // ------------------------------------------------------------------------------------------------- (4)
        // The PGD entry is empty: point it at bm_pud first
        if (pgd_none(pgd))
            __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);

        // ------------------------------------------------------------------------------------------------- (5)
        // fixmap_pxd() 1) uses the PGD/PUD/PMD entry to get the physical base address of the next level table
        //              2) uses addr to get the table index, and from it the entry's physical address
        //              3) converts that to the entry's virtual address
        pudp = fixmap_pud(addr); // 0XFFFF0000121AAFF8
                                 // pudp.pud = 0X421A9003
    }

    // ----------------------------------------------------------------------------------------------------- (6)
    // Point the PUD entry at bm_pmd (only if it is still empty) and the PMD
    // entry at bm_pte (unconditionally)
    if (pud_none(READ_ONCE(*pudp)))
        __pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
    pmdp = fixmap_pmd(addr); // 0XFFFF0000121A9F98
                             // pmdp.pmd = 0X421A8003
    __pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);

    /*
     * The boot-ioremap range spans multiple pmds, for which
     * we are not prepared:
     */
    BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
             != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));

    /* Runtime check: both ends of the BTMAP range must use the PMD entry set up above. */
    if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
         || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
        WARN_ON(1);
        pr_warn("pmdp %p != %p, %p\n",
            pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
            fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
        pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
            fix_to_virt(FIX_BTMAP_BEGIN));
        pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
            fix_to_virt(FIX_BTMAP_END));

        pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
        pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
    }
}

我们先来看看通过QEMU模拟我获得的一些地址信息：

FIXADDR_START        = 0XFFFF7DFFFE7F9000 
// -----------------------------------------------
PGD Entry VA         = 0XFFFF00001223C7D8 
PGD Entry Value (PA) = 0X421AA003
// -----------------------------------------------
PUD Entry VA         = 0XFFFF0000121AAFF8
PUD Entry Value (PA) = 0X421A9003
// -----------------------------------------------
PMD Entry VA         = 0XFFFF0000121A9F98
PMD Entry Value (PA) = 0X421A8003
// -----------------------------------------------
init_mm.pgd VA       = 0XFFFF00001223C000
init_mm.pgd[0]       = 0X4223D003
init_mm.pgd[X]       = 0X421AA003  // X = 0x7D8 / 8 = 251，这个就是上面的PGD Entry，可以看出，FIXMAP的映射和内核映射用了不同的PGD entry

还有一点很有意思，不知道QEMU是如何操作的，可能在运行第一次后，这些页表就被保存了。因为当这个函数还没有被调用的时候，通过GDB已经可以看到这些页表的数据（当时还在想是不是QEMU重启时没有将内存清空，但通过观察其它一些全局变量，打消了这个想法）。

上面的(1)到(6)已经写得很清楚了，当这个函数结束时，PGD、PUD、PMD各有一个entry被填充（分别指向bm_pud、bm_pmd、bm_pte），而bm_pte本身只是被挂接到了PMD上，其中的entry还没有被填充。因此，只要虚拟地址落在同一个PMD entry覆盖的范围内，就可以直接使用bm_pte这一整张PTE页表了。
这些页表就是静态生成的bm_pxd页表，我们现在来看看：

// Statically allocated page tables used for the fixmap; early_fixmap_init()
// installs them via __pa_symbol(). Each table has 512 entries (with 4 KB pages).
static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;

这个页表有一个细节在下一个函数中会涉及 - FIXADDR_START和FIX_PGD都映射到同一个PMD的entry，但FIX_FDT_END却不是。更准确的说，FIX_FDT到FIX_PGD（左开右闭，即不包含FIX_FDT）都映射到同一个PMD的entry，而FIX_FDT对应的虚拟地址恰好是2MB对齐的临界地址，它属于相邻的下一个PMD entry。

setup_machine_fdt()

这个函数主要是处理DT相关的内容，对于内存管理，它会获取内存信息然后使用memblock来进行管理。这里我主要关心的是内存管理相关内容，因此只关注fixmap_remap_fdt()和early_init_dt_scan_memory()。

/*
 * Map the device tree blob located at physical address dt_phys through the
 * fixmap and run the early device-tree scan on it.
 *
 * If the mapping or the scan fails, an error is printed and the CPU spins
 * forever. Otherwise the machine model name, when present, is logged and
 * recorded for stack dumps.
 */
static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
    /* NULL when the blob is unset, misaligned, has a bad magic or is too large. */
    void *dt_virt = fixmap_remap_fdt(dt_phys);
    const char *name;

    if (!dt_virt || !early_init_dt_scan(dt_virt)) {
        pr_crit("\n"
            "Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
            "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
            "\nPlease check your bootloader.",
            &dt_phys, dt_virt);

        /* Nothing sensible can be done without a device tree: park here. */
        while (true)
            cpu_relax();
    }

    name = of_flat_dt_get_machine_name();
    if (!name)
        return;

    pr_info("Machine model: %s\n", name);
    dump_stack_set_arch_desc("%s (DT)", name);
}

fixmap_remap_fdt()

这个函数就是将dt_phys进行映射，它通过__fixmap_remap_fdt()完成映射，并将该区域放入memblock中的reserve区域。

/*
 * Map the FDT read-only (PAGE_KERNEL_RO) via __fixmap_remap_fdt() and, on
 * success, reserve its physical range [dt_phys, dt_phys + size) in memblock.
 *
 * Returns the virtual address of the FDT, or NULL if the mapping failed.
 */
void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
{
    void *dt_virt;
    int size;   /* filled in by __fixmap_remap_fdt() from the FDT header */

    dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
    if (!dt_virt)
        return NULL;

    memblock_reserve(dt_phys, size);
    return dt_virt;
}

__fixmap_remap_fdt()

这个函数最重要的就是(1)处调用的函数，它是实际进行映射的函数。

/*
 * Map the FDT at physical address dt_phys into the FIX_FDT window with
 * protection prot, and store its total size (from the FDT header) in *size.
 *
 * Returns the virtual address of the FDT, or NULL when dt_phys is zero or
 * not MIN_FDT_ALIGN-aligned, the magic number does not match, or the size
 * exceeds MAX_FDT_SIZE. *size is written only once the magic has been
 * validated.
 */
void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
{
    const u64 dt_virt_base = __fix_to_virt(FIX_FDT); // dt_virt_base = 0XFFFF7DFFFE800000
    int offset;
    void *dt_virt;

    /*
     * Check whether the physical FDT address is set and meets the minimum
     * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
     * at least 8 bytes so that we can always access the magic and size
     * fields of the FDT header after mapping the first chunk, double check
     * here if that is indeed the case.
     */
    BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
    if (!dt_phys || dt_phys % MIN_FDT_ALIGN) // dt_phys = 0x48000000
        return NULL;

    /*
     * Make sure that the FDT region can be mapped without the need to
     * allocate additional translation table pages, so that it is safe
     * to call create_mapping_noalloc() this early.
     *
     * On 64k pages, the FDT will be mapped using PTEs, so we need to
     * be in the same PMD as the rest of the fixmap.
     * On 4k pages, we'll use section mappings for the FDT so we only
     * have to be in the same PUD.
     */
    BUILD_BUG_ON(dt_virt_base % SZ_2M);

    BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
             __fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);

    /* Keep the blob's offset within its block so VA and PA stay congruent. */
    offset = dt_phys % SWAPPER_BLOCK_SIZE; // SWAPPER_BLOCK_SIZE = 2MB
    dt_virt = (void *)dt_virt_base + offset;

    /* map the first chunk so we can read the size from the header */
    // ----------------------------------------------------------------------------------------------------- (1)
    // Map one block (2 MB) first; if that turns out to be too small, the
    // mapping is created again below with a larger size
    create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
            dt_virt_base, SWAPPER_BLOCK_SIZE, prot);

    if (fdt_magic(dt_virt) != FDT_MAGIC)
        return NULL;

    *size = fdt_totalsize(dt_virt);
    if (*size > MAX_FDT_SIZE)
        return NULL;

    /* The blob runs past the first block: remap with the rounded-up size. */
    if (offset + *size > SWAPPER_BLOCK_SIZE)
        create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
                   round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);

    return dt_virt;
}

create_mapping_noalloc()

我单独把这个函数提出来，因为这个函数会一层一层地调用很多函数。但不管怎么说，这个函数功能很简单，就是在现有页表的基础上进行VA->PA的映射（因此叫noalloc）。

/*
 * Map [virt, virt + size) to phys with protection prot in init_mm's page
 * tables.
 *
 * This function can only be used to modify existing table entries,
 * without allocating new levels of table. Note that this permits the
 * creation of new section or page entries.
 *
 * Requests with virt below VMALLOC_START are refused with a warning.
 */
static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
                  phys_addr_t size, pgprot_t prot)
{
    if (virt < VMALLOC_START) {
        pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
            &phys, virt);
        return;
    }
    /*
     * A NULL pgtable_alloc means a missing table level hits BUG_ON in the
     * lower-level helpers; contiguous mappings are disabled as well.
     */
    __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, NO_CONT_MAPPINGS);
}

__create_pgd_mapping()

这是第一个被调用的函数，它会利用现有的PGD来映射PUD。

/*
 * Map [virt, virt + size) to phys in the page tables rooted at pgdir.
 *
 * The range is widened to whole pages and then walked one PGD entry at a
 * time, with alloc_init_pud() doing the work below each entry.
 * pgtable_alloc supplies new table pages and may be NULL when none should be
 * needed; flags is passed down unchanged (NO_BLOCK_MAPPINGS and
 * NO_CONT_MAPPINGS are tested by the lower levels).
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
                 unsigned long virt, phys_addr_t size,
                 pgprot_t prot,
                 phys_addr_t (*pgtable_alloc)(void),
                 int flags)
{
    unsigned long addr, length, end, next;
    pgd_t *pgdp = pgd_offset_raw(pgdir, virt);

    /*
     * If the virtual and physical address don't have the same offset
     * within a page, we cannot map the region as the caller expects.
     */
    if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
        return;

    /* Page-align both addresses and grow the length by the dropped offset. */
    phys &= PAGE_MASK;
    addr = virt & PAGE_MASK;
    length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));

    end = addr + length;
    do {
        // ------------------------------------------------------------------------------------------------- (1)
        // next is the smaller of end and the start of the range covered by the next PGD entry;
        // for the FDT mapping, end is much smaller
        next = pgd_addr_end(addr, end);
        alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, flags);
        phys += next - addr;
    } while (pgdp++, addr = next, addr != end);
}

(1) 这里的next取end和下一个PGD entry映射的虚拟地址中小的一个，这里显然取end。

alloc_init_pud()

这个函数用于建立PUD的映射，同PGD相同，我们这里只需要一级PUD足以。这个函数最难理解的部分在(1)处，想了很久，不知道为什么一定要这样。下面我会解释这个函数（宏）是如何工作的。

/*
 * Fill in the PUD entries below *pgdp for the range [addr, end), mapping it
 * to phys with protection prot.
 *
 * If the PGD entry is empty, a PUD table is obtained from pgtable_alloc
 * (which must then be non-NULL). Each PUD entry becomes a 1 GB block when
 * use_1G_block() allows it and NO_BLOCK_MAPPINGS is not set in flags;
 * otherwise the work is passed on to alloc_init_cont_pmd().
 */
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
               phys_addr_t phys, pgprot_t prot,
               phys_addr_t (*pgtable_alloc)(void),
               int flags)
{
    unsigned long next;
    pud_t *pudp;
    pgd_t pgd = READ_ONCE(*pgdp);

    if (pgd_none(pgd)) {
        phys_addr_t pud_phys;
        BUG_ON(!pgtable_alloc);
        pud_phys = pgtable_alloc();
        __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
        pgd = READ_ONCE(*pgdp);
    }
    BUG_ON(pgd_bad(pgd));

    // ----------------------------------------------------------------------------------------------------- (1)
    // pud_set_fixmap_offset() maps the FIX_PUD fixmap slot onto the page that holds the PUD table
    // and returns the virtual address of the entry for addr:
    //   PUD entry physical address: pgd_page_paddr(*pgdp) + pud_index(addr) * sizeof(pud_t)
    //   virtual address used      : FIX_PUD slot + offset of that entry within the page
    //
    // FIX_PUD and FIXADDR_START share the same PMD entry in bm_pmd (otherwise one more entry would
    // be needed in bm_pmd), so the PTE for FIX_PUD lives in bm_pte. After this mapping the PUD table
    // is reached through FIX_PUD rather than by the walk:
    //   pgd -> pa of pud entry -> va of pud entry
    pudp = pud_set_fixmap_offset(pgdp, addr);
    do {
        pud_t old_pud = READ_ONCE(*pudp);

        next = pud_addr_end(addr, end);

        /*
         * For 4K granule only, attempt to put down a 1GB block
         */
        if (use_1G_block(addr, next, phys) &&
            (flags & NO_BLOCK_MAPPINGS) == 0) {
            pud_set_huge(pudp, phys, prot);

            /*
             * After the PUD entry has been populated once, we
             * only allow updates to the permission attributes.
             */
            BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
                              READ_ONCE(pud_val(*pudp))));
        } else {
            alloc_init_cont_pmd(pudp, addr, next, phys, prot,
                        pgtable_alloc, flags);

            /* A PUD entry that was already populated must not have changed. */
            BUG_ON(pud_val(old_pud) != 0 &&
                   pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
        }
        phys += next - addr;
    } while (pudp++, addr = next, addr != end);

    /* Presumably removes the temporary FIX_PUD mapping - definition not shown here. */
    pud_clear_fixmap();
}

(1)注释部分已经讲的比较清楚了，我们用FIX_PUD来映射addr对应的PUD，而这里的addr就是FIX_FDT对应的虚拟地址。它映射的PUD就是bm_pud，因此FIX_PUD刚好映射到bm_pud。之后，我们就可以直接用FIX_PUD +/- Offset来访问bm_pud的不同entry。

之所以我这里想了很久，主要是没理清楚bm_pxd, FIX_PXD和FIX_FDT之间的关系，不明白为什么要用FIX_PUD来映射（完全可以获取PUD entry的物理地址，然后再通过OFFSET完成这个工作啊？）。不管这些细节了，在该函数之后的部分，都是通过FIX_PUD这个地址访问bm_pud。之后的FIX_PMD和bm_pmd也是这样的操作。

FIX_PUD映射相关宏/函数

这里简单记录下FIX_PUD是如何进行映射的。

pudp = pud_set_fixmap_offset(pgdp, addr);

// Map the PUD table referenced by *pgd through the FIX_PUD slot and return
// the virtual address of the PUD entry that covers addr.
#define pud_set_fixmap_offset(pgd, addr)    pud_set_fixmap(pud_offset_phys(pgd, addr))

// --------------------------------------------------------------------------------------------------------- (1)
// Compute the physical address of the PUD entry
#define pud_offset_phys(dir, addr)          (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t))

// --------------------------------------------------------------------------------------------------------- (2)
// Map FIX_PUD to the physical address of the PUD entry
#define pud_set_fixmap(addr)                ((pud_t *)set_fixmap_offset(FIX_PUD, addr))
#define set_fixmap_offset(idx, phys)\
    __set_fixmap_offset(idx, phys, FIXMAP_PAGE_NORMAL)
// --------------------------------------------------------------------------------------------------------- (3)
// __set_fixmap() maps the fixmap slot idx to phys by writing the PTE directly.
// Writing the PTE directly works because FIX_PUD and FIXADDR_START fall under the
// same PMD entry (so the PTE lives in bm_pte); the helper is therefore not fully general.
// The value of the macro is the virtual address of the PUD entry: fix_to_virt(idx) is the
// page-aligned slot now mapped onto the PUD table page, and the low bits of phys add the
// offset of the entry within that page.
// (These notes sit above the macro because a // comment inside a backslash-continued
// macro body would cut the macro short.)
#define __set_fixmap_offset(idx, phys, flags)\
({\
    unsigned long ________addr;\
    __set_fixmap(idx, phys, flags);\
    ________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1));\
    ________addr;\
})

/*
 * Install or remove the mapping for fixmap slot idx.
 *
 * With non-zero flags, the slot's PTE is set to map the page frame of phys
 * with those protection bits. With zero flags, the PTE is cleared and the
 * TLB is flushed for that one page.
 *
 * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
 * ever need to use IPIs for TLB broadcasting, then we're in trouble here.
 */
void __set_fixmap(enum fixed_addresses idx,
                   phys_addr_t phys, pgprot_t flags)
{
    unsigned long addr = __fix_to_virt(idx);
    pte_t *ptep;

    /* Only slots strictly between FIX_HOLE and __end_of_fixed_addresses are valid. */
    BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);

    ptep = fixmap_pte(addr);
    // ----------------------------------------------------------------------------------------------------- (4)
    // The PTE is written directly here
    if (pgprot_val(flags)) {
        set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
    } else {
        pte_clear(&init_mm, addr, ptep);
        flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
    }
}

(1) pud_offset_phys宏计算出PUD entry的物理地址；
(2) 用一系列的宏/函数将FIX_PUD和PUD entry进行映射；
(3) __set_fixmap()函数最终将PTE进行更新，之后就可以通过FIX_PUD访问PUD entry了。

alloc_init_cont_pmd()和init_pmd()

有了上面的基础，下面这两个函数就很容易理解了。直接看(1)(2)即可。

/*
 * Make sure *pudp points to a PMD table, then map [addr, end) to phys in
 * steps bounded by pmd_cont_addr_end(), calling init_pmd() for each step.
 *
 * If the PUD entry is empty, a PMD table is obtained from pgtable_alloc
 * (which must then be non-NULL). PTE_CONT is added to the protection bits
 * of a step only when addr, next and phys are all CONT_PMD-aligned and
 * NO_CONT_MAPPINGS is not set in flags.
 */
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
                unsigned long end, phys_addr_t phys,
                pgprot_t prot,
                phys_addr_t (*pgtable_alloc)(void), int flags)
{
    unsigned long next;
    pud_t pud = READ_ONCE(*pudp);

    /*
     * Check for initial section mappings in the pgd/pud.
     */
    BUG_ON(pud_sect(pud));
    if (pud_none(pud)) {
        phys_addr_t pmd_phys;
        BUG_ON(!pgtable_alloc);
        pmd_phys = pgtable_alloc();
        __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
        pud = READ_ONCE(*pudp);
    }
    BUG_ON(pud_bad(pud));

    do {
        /* Per-step copy so the contiguous hint never leaks into later steps. */
        pgprot_t __prot = prot;

        next = pmd_cont_addr_end(addr, end);

        /* use a contiguous mapping if the range is suitably aligned */
        if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
            (flags & NO_CONT_MAPPINGS) == 0)
            __prot = __pgprot(pgprot_val(prot) | PTE_CONT);

        init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);

        phys += next - addr;
    } while (addr = next, addr != end);
}

/*
 * Fill in the PMD entries below *pudp for the range [addr, end), mapping it
 * to phys with protection prot.
 *
 * Each PMD entry becomes a section (block) mapping when addr, next and phys
 * are all section-aligned and NO_BLOCK_MAPPINGS is not set in flags;
 * otherwise the work is passed on to alloc_init_cont_pte().
 */
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
             phys_addr_t phys, pgprot_t prot,
             phys_addr_t (*pgtable_alloc)(void), int flags)
{
    unsigned long next;
    pmd_t *pmdp;

    // ----------------------------------------------------------------------------------------------------- (1)
    // As with the PUD level, the PMD table is reached through the FIX_PMD fixmap slot
    pmdp = pmd_set_fixmap_offset(pudp, addr);
    do {
        pmd_t old_pmd = READ_ONCE(*pmdp);

        next = pmd_addr_end(addr, end);

        /* try section mapping first */
        if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
            (flags & NO_BLOCK_MAPPINGS) == 0) {
            // --------------------------------------------------------------------------------------------- (2)
            // Create a 2 MB section mapping directly
            pmd_set_huge(pmdp, phys, prot);

            /*
             * After the PMD entry has been populated once, we
             * only allow updates to the permission attributes.
             */
            BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
                              READ_ONCE(pmd_val(*pmdp))));
        } else {
            alloc_init_cont_pte(pmdp, addr, next, phys, prot,
                        pgtable_alloc, flags);

            /* A PMD entry that was already populated must not have changed. */
            BUG_ON(pmd_val(old_pmd) != 0 &&
                   pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
        }
        phys += next - addr;
    } while (pmdp++, addr = next, addr != end);

    /* Presumably removes the temporary FIX_PMD mapping - definition not shown here. */
    pmd_clear_fixmap();
}

(1) 将FIX_PMD和bm_pmd进行映射；
(2) 这里我们直接按照section的方式映射2MB的内存空间。

setup_machine_fdt()的剩余部分

这部分主要就是扫描DTS并使用memblock来管理这些发现的内存空间，关于memblock我将在下一篇笔记中开始讲解。

Next Post

【MEM04】memblock
Previous Post

ARMv8 MMU和Linux的启动

CATALOG

1. 初看FIXMAP
2. FDT映射的建立