从零到负一

【LM16】vmalloc()

2024/01/26

在之前几篇笔记中,我已经介绍了两种内存的分配方式 - 伙伴系统和slab系统。这两种内存的分配方式都有一个特点,它们都是分配连续的物理内存空间。而这篇笔记要介绍的vmalloc()却是要分配连续的虚拟内存空间,其对应的物理内存空间是可以不连续的。通过vmalloc()的介绍,我们就可以看出它的一个缺点 - 它需要重新映射物理内存和虚拟内存,并且需要一个页一个页的重新映射。这无疑影响了内存分配的性能。因此,只有在不得已时才用vmalloc(),并且尽量分配大内存。因为vmalloc()并不是一种常用的内存分配方式,因此我这里只做简单地分析。

相关结构体

vmalloc()中用了两个结构体,分别是vmap_area和vm_struct,下面我们来看看这两个结构体。

vm_struct

1
2
3
4
5
6
7
8
9
10
struct vm_struct {
struct vm_struct *next; // 下一个vm_struct, 所有的vm_struct通过list连接在一起
void *addr; // vm虚拟地址空间的起始地址
unsigned long size; // vm虚拟地址空间的大小
unsigned long flags;
struct page **pages; // 指向*page数组的指针
unsigned int nr_pages; // 该vm虚拟地址空间需要的page数量
phys_addr_t phys_addr;
const void *caller;
};

vmap_area

1
2
3
4
5
6
7
8
9
10
struct vmap_area {
unsigned long va_start; // vm虚拟地址空间的起始地址
unsigned long va_end; // vm虚拟地址空间的结束地址
unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */ // 接入vmap_area_root
struct list_head list; /* address sorted list */ // 接入vmap_area_list
struct llist_node purge_list; /* "lazy purge" list */
struct vm_struct *vm; // 该area对应的vm_struct
struct rcu_head rcu_head;
};

vmalloc()的初始化

vmalloc_init()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
// ./mm/vmalloc.c

// ##########################################################
// vmalloc_init()
// ##########################################################
void __init vmalloc_init(void)
{
struct vmap_area *va;
struct vm_struct *tmp;
int i;

// 初始化vmap_block_queue和vfree_deferred
// 后者用于延迟释放
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
struct vfree_deferred *p;

vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
p = &per_cpu(vfree_deferred, i);
init_llist_head(&p->list);
INIT_WORK(&p->wq, free_work);
}

/* Import existing vmlist entries. */
// 根据现有的vm_struct, 创建对应的vm_area并将vm_area插入
// vmap_area_root以及vmap_area_list
for (tmp = vmlist; tmp; tmp = tmp->next) {
va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
__insert_vmap_area(va);
}

vmap_area_pcpu_hole = VMALLOC_END;

vmap_initialized = true;
}

// ##########################################################
// __insert_vmap_area()
// ##########################################################
static void __insert_vmap_area(struct vmap_area *va)
{
struct rb_node **p = &vmap_area_root.rb_node;
struct rb_node *parent = NULL;
struct rb_node *tmp;

// 插入vmap_area_root - 红黑树
while (*p) {
struct vmap_area *tmp_va;

parent = *p;
tmp_va = rb_entry(parent, struct vmap_area, rb_node);
if (va->va_start < tmp_va->va_end)
p = &(*p)->rb_left;
else if (va->va_end > tmp_va->va_start)
p = &(*p)->rb_right;
else
BUG();
}

rb_link_node(&va->rb_node, parent, p);
rb_insert_color(&va->rb_node, &vmap_area_root);

/* address-sort this list */
// 插入vmap_area_list - 链表
tmp = rb_prev(&va->rb_node);
if (tmp) {
struct vmap_area *prev;
prev = rb_entry(tmp, struct vmap_area, rb_node);
list_add_rcu(&va->list, &prev->list);
} else
list_add_rcu(&va->list, &vmap_area_list);
}

这部分比较简单,主要是进行初始化以及根据现有的vm_struct生成对应的vmap_area并添加入对应的红黑树和链表。

vmalloc()的分配

vmalloc()的分配函数就是vmalloc(),它是一个封装函数,通过几层封装,最后调用__vmalloc_node_range()函数。这里,为了减少无用代码,就直接从__vmalloc_node_range()函数开始。

__vmalloc_node_range()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
// ./mm/vmalloc.c

/**
* __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @start: vm area range start
* @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
// 这里的start = VMALLOC_START; end = VMALLOC_END
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
// vmalloc()最小分配单元是页
size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > totalram_pages())
goto fail;

// ---------------------------------------------------------------------- (1)
// 分配一个vmap_area并和area(vm_struct)进行映射
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;

// ---------------------------------------------------------------------- (2)
// 分配页并修改页表
addr = __vmalloc_area_node(area, gfp_mask, prot, node);
if (!addr)
return NULL;

/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
*/
clear_vm_uninitialized_flag(area);

kmemleak_vmalloc(area, size, gfp_mask);

return addr;

fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure: %lu bytes", real_size);
return NULL;
}

我们先来看(1)处的函数,

__get_vm_area_node()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
// ./mm/vmalloc.c

static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
// 不能在中断中使用该函数
BUG_ON(in_interrupt());
// 再次将size进行页大小的对齐(向上)
size = PAGE_ALIGN(size);
if (unlikely(!size))
return NULL;

if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, get_count_order_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);

area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;

if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
// 这部分比较繁琐,就不展开了,简单来说这个函数从VMALLOC_START的地址开始,查找每个已存在的vmalloc区块的hole
// 能否容纳目前要分配内存的大小。如果在已有的vmalloc区块的中没能找到合适的hole,那么从最后一块vmalloc区块的
// 结束地址开始一个新的vmalloc区域,并将其添加入vmap_area_root和vmap_area_list
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
// 对va和area进行相应的初始化
setup_vmalloc_vm(area, va, flags, caller);

return area;
}

这个函数完成后,我们就有了vm_struct和vmap_area了,接下来就该看__vmalloc_node_range()中(2)处的函数了。

__vmalloc_area_node()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// ./mm/vmalloc.c

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
0 :
__GFP_HIGHMEM;
// 根据area的大小确定需要多少物理页
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));

area->nr_pages = nr_pages;
// 根据array_size的大小确定用什么方式获取pages的内存空间
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
PAGE_KERNEL, node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
// area->pages是指向物理页指针的数组
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}

for (i = 0; i < area->nr_pages; i++) {
struct page *page;

if (node == NUMA_NO_NODE)
// 如果是UMA系统,那么直接用伙伴系统获取一个页
page = alloc_page(alloc_mask|highmem_mask);
else
// 否则需要指定从哪个node获取一个页
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);

if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
// ------------------------------------------- (1)
// 对分配的物理页重新进行页表的映射
if (map_vm_area(area, prot, pages))
goto fail;
return area->addr;

fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}

这个函数一个最重要的地方就是对物理页进行重新映射,我们接下来就看看(1)处的这个函数,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// ./mm/vmalloc.c

int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
{
unsigned long addr = (unsigned long)area->addr;
unsigned long end = addr + get_vm_area_size(area);
int err;

err = vmap_page_range(addr, end, prot, pages);

return err > 0 ? 0 : err;
}

static int vmap_page_range(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
int ret;

ret = vmap_page_range_noflush(start, end, prot, pages);
flush_cache_vmap(start, end);
return ret;
}

// 从这里开始就是页表的一级一级地映射,这里我就不分析具体的映射过程了
/*
* Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
* will have pfns corresponding to the "pages" array.
*
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
*/
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
unsigned long next;
unsigned long addr = start;
int err = 0;
int nr = 0;

BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
if (err)
return err;
} while (pgd++, addr = next, addr != end);

return nr;
}

至此,vmalloc()的初始化和分配已经分析完毕,接下来我们就来看看其释放的过程。

vmalloc()的释放

vmalloc()的释放过程比较简单,直接调用vfree()函数即可,下面我们就来看看这个函数。

vfree()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// ./mm/vmalloc.c

/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
*
* Free the virtually continuous memory area starting at @addr, as
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
* NULL, no operation is performed.
*
* Must not be called in NMI context (strictly speaking, only if we don't
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
* May sleep if called *not* from interrupt context.
*
* NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{
BUG_ON(in_nmi());

kmemleak_free(addr);

might_sleep_if(!in_interrupt());

if (!addr)
return;
// 如果在中断中,那么就将该地址放入vfree_deferred中,
// vmalloc()的释放将被推迟
if (unlikely(in_interrupt()))
__vfree_deferred(addr);
// 如果不在中断中,那么现在就释放,并且会回收物理页(第二个参数!=0表示需要回收物理页)
else
__vunmap(addr, 1);
}

这个函数主要针对两种不同的情况对addr进行释放,第一种在中断中,第二种不在中断中。第一种的处理办法是推迟处理,而第二种的处理方法就是我接下来需要讨论的了。

__vunmap()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// ./mm/vmalloc.c

static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;

if (!addr)
return;

if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
addr))
return;

// 通过vmap_area_root在红黑树中找到符合addr要求的vmap_area以及其对应的vm_struct
area = find_vmap_area((unsigned long)addr)->vm;
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}

debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));

// 解除vmalloc之前做的物理页的映射
// --> free_unmap_vmap_area()
// ---> unmap_vmap_area() ---------- 解除映射
// ---> free_vmap_area_noflush() --- 将vmap_area添加入vmap_purge_list, 进行lazy_TLB处理(稍后再进行TLB的flush)
remove_vm_area(addr);
if (deallocate_pages) {
int i;

// 将vmalloc()分配的物理页释放回伙伴系统
for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];

BUG_ON(!page);
__free_pages(page, 0);
}

kvfree(area->pages);
}

kfree(area);
return;
}

vmap()

除了vmalloc()外,还有一个函数vmap()可以进行类似的工作。我们下面来看看这个函数,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// ./mm/vmalloc.c

/**
* vmap - map an array of pages into virtually contiguous space
* @pages: array of page pointers
* @count: number of pages to map
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
* Maps @count pages from @pages into contiguous kernel virtual
* space.
*/
void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
unsigned long size; /* In bytes */

might_sleep();

if (count > totalram_pages())
return NULL;

size = (unsigned long)count << PAGE_SHIFT;
// 调用__get_vm_area_node()获取vmap_area和vm_struct
area = get_vm_area_caller(size, flags, __builtin_return_address(0));
if (!area)
return NULL;
// 对物理页进行新的映射
if (map_vm_area(area, prot, pages)) {
vunmap(area->addr);
return NULL;
}

return area->addr;
}

这个函数和vmalloc()最大的区别是它将需要映射的物理页作为输入,而不需要通过伙伴系统来获取这些物理页;vmalloc()需要使用伙伴系统来获取这些页。除此之外,这两个函数区别不大,都是调用相同的函数 - __get_vm_area_node()以及map_vm_area()
至于释放,它通过调用vunmap()来完成相关工作,vunmap()的底层其实和vfree()一样,调用__vunmap()函数。

参考资料

  1. 【原创】(十二)Linux内存管理之vmap与vmalloc
  2. linux内存管理笔记(二十九)—-vmalloc
CATALOG
  1. 1. 相关结构体
    1. 1.1. vm_struct
    2. 1.2. vmap_area
  2. 2. vmalloc()的初始化
    1. 2.1. vmalloc_init()
  3. 3. vmalloc()的分配
    1. 3.1. __vmalloc_node_range()
      1. 3.1.1. __get_vm_area_node()
      2. 3.1.2. __vmalloc_area_node()
  4. 4. vmalloc()的释放
    1. 4.1. vfree()
      1. 4.1.1. __vunmap()
  5. 5. vmap()
  6. 6. 参考资料