/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting
 * unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create(const char *name, size_t size, size_t align,
        unsigned long flags,
        void (*ctor)(void *, kmem_cache_t *, unsigned long),
        void (*dtor)(void *, kmem_cache_t *, unsigned long))
{
        size_t left_over, slab_size, ralign;
        kmem_cache_t *cachep = NULL;
        /*
         * Sanity checks... these are all serious usage bugs.
         */
        if ((!name) ||
                in_interrupt() ||
                (size < BYTES_PER_WORD) ||
                (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) ||
                (dtor && !ctor)) {
                        printk(KERN_ERR "%s: Early error in slab %s\n",
                                        __FUNCTION__, name);
                        BUG();
        }
        if (flags & SLAB_DESTROY_BY_RCU)
                BUG_ON(dtor);
        /*
         * Always checks flags, a caller might be expecting debug
         * support which isn't available.
         */
        if (flags & ~CREATE_MASK)
                BUG();
        /* Check that size is in terms of words.  This is needed to avoid
         * unaligned accesses for some archs when redzoning is used, and makes
         * sure any on-slab bufctl's are also correctly aligned.
         */
        /* Round size up to a multiple of BYTES_PER_WORD. */
        if (size & (BYTES_PER_WORD - 1)) {
                size += (BYTES_PER_WORD - 1);
                size &= ~(BYTES_PER_WORD - 1);
        }
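/*
 * Standalone user-space sketch (not kernel code): the round-up above only
 * works because BYTES_PER_WORD is a power of two. Here BYTES_PER_WORD is
 * assumed to be sizeof(void *) on the host, which is an illustration, not
 * the kernel's definition.
 */
#include <stdio.h>

#define BYTES_PER_WORD sizeof(void *)   /* assumption: host word size */

/* Round size up to the next multiple of BYTES_PER_WORD (a power of two). */
static size_t round_up_to_word(size_t size)
{
        if (size & (BYTES_PER_WORD - 1)) {
                size += (BYTES_PER_WORD - 1);   /* step past the boundary */
                size &= ~(BYTES_PER_WORD - 1);  /* clear the low bits */
        }
        return size;
}

int main(void)
{
        /* On a 64-bit host: 13 -> 16, 16 -> 16, 17 -> 24. */
        printf("%zu %zu %zu\n", round_up_to_word(13),
               round_up_to_word(16), round_up_to_word(17));
        return 0;
}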
        /* calculate out the final buffer alignment: */
        /* 1) arch recommendation: can be overridden for debug */
        if (flags & SLAB_HWCACHE_ALIGN) {
                /* Default alignment: as specified by the arch code.
                 * Except if an object is really small, then squeeze multiple
                 * objects into one cacheline.
                 */
                ralign = cache_line_size();
                while (size <= ralign / 2)
                        ralign /= 2;
        } else {
                ralign = BYTES_PER_WORD;
        }
        /* 2) arch mandated alignment: disables debug if necessary */
        if (ralign < ARCH_SLAB_MINALIGN) {
                ralign = ARCH_SLAB_MINALIGN;
                if (ralign > BYTES_PER_WORD)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        }
        /* 3) caller mandated alignment: disables debug if necessary */
        if (ralign < align) {
                ralign = align;
                if (ralign > BYTES_PER_WORD)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        }
        /* 4) Store it. Note that the debug code below can reduce
         *    the alignment to BYTES_PER_WORD.
         */
        /* Write the final buffer alignment back into align. */
        align = ralign;
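/*
 * Standalone user-space sketch (not kernel code) of step 1 above: with
 * SLAB_HWCACHE_ALIGN the alignment starts at the cacheline size and is
 * halved while the object still fits into half of it. cache_line_size()
 * is replaced by an explicit parameter, and the BYTES_PER_WORD floor is
 * an added guard (the kernel instead relies on size >= BYTES_PER_WORD).
 */
#include <stdio.h>

#define BYTES_PER_WORD sizeof(void *)   /* assumption: host word size */

static size_t hwcache_ralign(size_t size, size_t cache_line)
{
        size_t ralign = cache_line;

        /* Squeeze several small objects into one cacheline. */
        while (size <= ralign / 2)
                ralign /= 2;
        if (ralign < BYTES_PER_WORD)
                ralign = BYTES_PER_WORD;
        return ralign;
}

int main(void)
{
        /* With a 64-byte cacheline: 20-byte objects align to 32,
         * 8-byte objects to 8, 100-byte objects to 64. */
        printf("%zu %zu %zu\n", hwcache_ralign(20, 64),
               hwcache_ralign(8, 64), hwcache_ralign(100, 64));
        return 0;
}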
        /* Determine if the slab management is 'on' or 'off' slab. */
        if (size >= (PAGE_SIZE >> 3))
                /*
                 * Size is large, assume best to place the slab management obj
                 * off-slab (should allow better packing of objs).
                 */
                /* The larger the object, the more it pays to move the slab
                 * and object descriptors off-slab. */
                flags |= CFLGS_OFF_SLAB;
        /* Align size again, now to the final object alignment. */
        size = ALIGN(size, align);
        /* From here on, compute the slab's internal layout (a standalone
         * sketch of this order-selection loop follows the loop below). */
        if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
                /*
                 * A VFS-reclaimable slab tends to have most allocations
                 * as GFP_NOFS and we really don't want to have to be allocating
                 * higher-order pages when we are unable to shrink dcache.
                 */
                cachep->gfporder = 0;
                cache_estimate(cachep->gfporder, size, align, flags,
                                        &left_over, &cachep->num);
        } else {
                /*
                 * Calculate size (in pages) of slabs, and the num of objs per
                 * slab.  This could be made much more intelligent.  For now,
                 * try to avoid using high page-orders for slabs.  When the
                 * gfp() funcs are more friendly towards high-order requests,
                 * this should be changed.
                 */
                do {
                        unsigned int break_flag = 0;
cal_wastage:
                        cache_estimate(cachep->gfporder, size, align, flags,
                                                &left_over, &cachep->num);
                        if (break_flag)
                                break;
                        if (cachep->gfporder >= MAX_GFP_ORDER)
                                break;
                        /* If not even one object fits, the page order must
                         * be increased. */
                        if (!cachep->num)
                                goto next;
                        /* Special case: too many objects for an off-slab
                         * bufctl array; drop the page order and recompute
                         * once more. */
                        if (flags & CFLGS_OFF_SLAB &&
                                        cachep->num > offslab_limit) {
                                /* This num of objs will cause problems. */
                                cachep->gfporder--;
                                break_flag++;
                                goto cal_wastage;
                        }
                        /*
                         * Large num of objs is good, but v. large slabs are
                         * currently bad for the gfp()s.
                         */
                        if (cachep->gfporder >= slab_break_gfp_order)
                                break;
                        if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
                                break;  /* Acceptable internal fragmentation. */
next:
                        cachep->gfporder++;
                } while (1);
        }
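/*
 * Standalone user-space sketch (not kernel code) of the order-selection
 * loop. estimate() is a rough stand-in for cache_estimate(): it assumes
 * on-slab management and made-up descriptor sizes (32-byte struct slab,
 * 4-byte bufctl entries), so the numbers are illustrative only.
 */
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define SLAB_DESC_SIZE  32UL    /* assumed sizeof(struct slab) */
#define BUFCTL_SIZE     4UL     /* assumed sizeof(kmem_bufctl_t) */
#define MAX_ORDER       5       /* assumed cap on the page order */

/* Fit as many objects as possible into 2^order pages after reserving
 * room for the slab descriptor plus one bufctl per object. */
static void estimate(unsigned int order, size_t objsize,
                     size_t *left_over, unsigned int *num)
{
        size_t total = PAGE_SIZE << order;
        unsigned int n = 0;

        while (SLAB_DESC_SIZE + (n + 1) * (BUFCTL_SIZE + objsize) <= total)
                n++;
        *num = n;
        *left_over = total - SLAB_DESC_SIZE - n * (BUFCTL_SIZE + objsize);
}

int main(void)
{
        size_t objsize = 1100, left_over;       /* example object size */
        unsigned int order = 0, num;

        /* Mirror the loop above: stop once at most 1/8 of the slab is waste. */
        for (;;) {
                estimate(order, objsize, &left_over, &num);
                if (num && left_over * 8 <= (PAGE_SIZE << order))
                        break;
                if (order >= MAX_ORDER)
                        break;
                order++;
        }
        printf("order %u: %u objects per slab, %zu bytes left over\n",
               order, num, left_over);
        return 0;
}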
        /*
         * If the slab has been placed off-slab, and we have enough space then
         * move it on-slab. This is at the expense of any extra colouring.
         */
        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
                flags &= ~CFLGS_OFF_SLAB;
                left_over -= slab_size;
        }
        if (flags & CFLGS_OFF_SLAB) {
                /* really off slab. No need for manual alignment */
                /* The slab descriptor lives off-slab, so no extra alignment
                 * is required. */
                slab_size = cachep->num * sizeof(kmem_bufctl_t) +
                                sizeof(struct slab);
        }
        /* From here on, compute the slab colouring parameters. */
        cachep->colour_off = cache_line_size();
        /* Offset must be a multiple of the alignment. */
        if (cachep->colour_off < align)
                cachep->colour_off = align;
        cachep->colour = left_over / cachep->colour_off;
        cachep->slab_size = slab_size;
        cachep->flags = flags;
        cachep->gfpflags = 0;
        if (flags & SLAB_CACHE_DMA)
                cachep->gfpflags |= GFP_DMA;
        spin_lock_init(&cachep->spinlock);
        cachep->objsize = size;
        /* NUMA */
        INIT_LIST_HEAD(&cachep->lists.slabs_full);
        INIT_LIST_HEAD(&cachep->lists.slabs_partial);
        INIT_LIST_HEAD(&cachep->lists.slabs_free);
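/*
 * Standalone user-space sketch (not kernel code): how colour and
 * colour_off turn into the starting offset of each new slab. The
 * left_over and cacheline values are made up, and the wrap-around is a
 * simplification of what cache_grow() does with colour_next.
 */
#include <stdio.h>

int main(void)
{
        size_t left_over = 432;                 /* assumed waste per slab */
        size_t colour_off = 64;                 /* assumed cacheline size */
        size_t colour = left_over / colour_off; /* distinct colours */
        size_t colour_next = 0;
        int i;

        printf("colours available: %zu\n", colour);
        for (i = 0; i < 10; i++) {
                /* Each new slab shifts its first object by one more colour,
                 * so objects of consecutive slabs hit different cachelines. */
                printf("slab %d starts at offset %zu\n",
                       i, colour_next * colour_off);
                if (++colour_next >= colour)
                        colour_next = 0;
        }
        return 0;
}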
        if (flags & CFLGS_OFF_SLAB)
                /* slabp_cache is the general cache that will hold the
                 * off-slab slab descriptors. kmem_find_general_cachep() is
                 * simple: it walks malloc_sizes[] and returns the smallest
                 * general cache whose objects are at least slab_size bytes. */
                cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0);
        cachep->ctor = ctor;
        cachep->dtor = dtor;
        cachep->name = name;
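/*
 * Standalone user-space sketch (not kernel code): the lookup performed by
 * kmem_find_general_cachep() amounts to "return the smallest general cache
 * whose object size is >= slab_size". The bucket table below is an
 * assumption in the spirit of the size-N kmalloc caches, not the real
 * malloc_sizes[] array.
 */
#include <stdio.h>

static const size_t bucket_sizes[] = {
        32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
};
#define NBUCKETS (sizeof(bucket_sizes) / sizeof(bucket_sizes[0]))

/* Return the smallest bucket that can hold `size` bytes, or 0 if none. */
static size_t find_general_bucket(size_t size)
{
        size_t i;

        for (i = 0; i < NBUCKETS; i++)
                if (bucket_sizes[i] >= size)
                        return bucket_sizes[i];
        return 0;
}

int main(void)
{
        /* A 172-byte off-slab descriptor would be carved out of the
         * 256-byte general cache. */
        printf("%d -> %zu\n", 172, find_general_bucket(172));
        return 0;
}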
static void enable_cpucache(kmem_cache_t *cachep)
{
        int err;
        int limit, shared;
        /* The head array serves three purposes:
         * - create a LIFO ordering, i.e. return objects that are cache-warm
         * - reduce the number of spinlock operations.
         * - reduce the number of linked list operations on the slab and
         *   bufctl chains: array operations are cheaper.
         * The numbers are guessed, we should auto-tune as described by
         * Bonwick.
         */
        /* Pick the local (per-CPU) array's maximum number of free objects
         * from the object size (a sketch of this heuristic follows the
         * function below). */
        if (cachep->objsize > 131072)
                limit = 1;
        else if (cachep->objsize > PAGE_SIZE)
                limit = 8;
        else if (cachep->objsize > 1024)
                limit = 24;
        else if (cachep->objsize > 256)
                limit = 54;
        else
                limit = 120;
        /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
         * allocation behaviour: Most allocs on one cpu, most free operations
         * on another cpu. For these cases, an efficient object passing between
         * cpus is necessary. This is provided by a shared array. The array
         * replaces Bonwick's magazine layer.
         * On uniprocessor, it's functionally equivalent (but less efficient)
         * to a larger limit. Thus disabled by default.
         */
        shared = 0;
#ifdef CONFIG_SMP
        if (cachep->objsize <= PAGE_SIZE)
                shared = 8;
#endif
#if DEBUG
        /* With debugging enabled, large batchcount lead to excessively
         * long periods with disabled local interrupts. Limit the
         * batchcount
         */
        if (limit > 32)
                limit = 32;
#endif
        /* On non-SMP builds shared stays 0, so the shared array cache
         * ends up holding no objects. */
        err = do_tune_cpucache(cachep,  /* kmem_cache_t *cachep */
                        limit,          /* limit: max free objects in the
                                         * per-CPU array */
                        (limit + 1) / 2, /* batchcount: objects moved per
                                          * refill/flush, half of limit */
                        shared);        /* shared */
        if (err)
                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
                                        cachep->name, -err);
}
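/*
 * Standalone user-space sketch (not kernel code): the size-to-limit table
 * and the batchcount = (limit + 1) / 2 rule from enable_cpucache(), pulled
 * into a helper so the heuristic is easy to see. The thresholds are copied
 * from the code above; pick_limit() is an invented name and PAGE_SIZE is
 * assumed to be 4 KiB.
 */
#include <stdio.h>

#define PAGE_SIZE 4096

static int pick_limit(size_t objsize)
{
        /* Bigger objects get smaller per-CPU arrays. */
        if (objsize > 131072)
                return 1;
        if (objsize > PAGE_SIZE)
                return 8;
        if (objsize > 1024)
                return 24;
        if (objsize > 256)
                return 54;
        return 120;
}

int main(void)
{
        size_t sizes[] = { 64, 512, 2048, 8192, 262144 };
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                int limit = pick_limit(sizes[i]);

                printf("objsize %6zu: limit %3d, batchcount %3d\n",
                       sizes[i], limit, (limit + 1) / 2);
        }
        return 0;
}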
                /* fixup slab chains */
                /* Put the slab back on the appropriate list. */
                if (slabp->inuse == 0) {
                        /* When does this branch trigger? When free_objects
                         * already exceeds free_limit. */
                        if (cachep->lists.free_objects > cachep->free_limit) {
                                /* Drop every free object held by this slab
                                 * from the count. */
                                cachep->lists.free_objects -= cachep->num;
                                /* The destroy path (slab_destroy() and
                                 * friends) is big enough to deserve its own
                                 * note. */
                                slab_destroy(cachep, slabp);
                        } else {
                                list_add(&slabp->list,
                                &list3_data_ptr(cachep, objp)->slabs_free);
                        }
                } else {
                        /* Unconditionally move a slab to the end of the
                         * partial list on free - maximum time for the
                         * other objects to be freed, too.
                         */
                        list_add_tail(&slabp->list,
                                &list3_data_ptr(cachep, objp)->slabs_partial);
                }
        }
}
/* This function mainly allocates the per-CPU local array caches and the
 * CPU-shared array cache descriptor. */
static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
                                int shared)
{
        struct ccupdate_struct new;
        struct array_cache *new_shared;
        int i;
        memset(&new.new, 0, sizeof(new.new));
        /* A local variable temporarily holds the addresses of the newly
         * allocated array caches. */
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_online(i)) {
                        /* The array_cache objects come from the general
                         * caches. */
                        new.new[i] = alloc_arraycache(i, limit, batchcount);
                        if (!new.new[i]) {
                                /* Roll back everything allocated so far. */
                                for (i--; i >= 0; i--)
                                        kfree(new.new[i]);
                                return -ENOMEM;
                        }
                } else {
                        new.new[i] = NULL;
                }
        }
        new.cachep = cachep;
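/*
 * Standalone user-space sketch (not kernel code) of the rollback idiom used
 * above: allocate one item per slot, and on the first failure free
 * everything already allocated before returning. The slot count and item
 * size are made up.
 */
#include <stdlib.h>
#include <string.h>

#define NR_ITEMS 8      /* stand-in for NR_CPUS */

/* Allocate one buffer per slot; on failure, undo the earlier slots. */
static int alloc_all(void *items[NR_ITEMS], size_t size)
{
        int i;

        memset(items, 0, NR_ITEMS * sizeof(items[0]));
        for (i = 0; i < NR_ITEMS; i++) {
                items[i] = malloc(size);
                if (!items[i]) {
                        for (i--; i >= 0; i--)  /* roll back */
                                free(items[i]);
                        return -1;
                }
        }
        return 0;
}

int main(void)
{
        void *items[NR_ITEMS];
        int i;

        if (alloc_all(items, 128) == 0)
                for (i = 0; i < NR_ITEMS; i++)
                        free(items[i]);
        return 0;
}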
        /* Need the semaphore to access the chain. */
        down(&cache_chain_sem);
        {
                struct list_head *p;
                mm_segment_t old_fs;
                old_fs = get_fs();
                set_fs(KERNEL_DS);
                list_for_each(p, &cache_chain) {
                        kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
                        char tmp;
                        /* This happens when the module gets unloaded and
                         * doesn't destroy its slab cache and no one else
                         * reuses the vmalloc area of the module. Print a
                         * warning. */
                        if (__get_user(tmp, pc->name)) {
                                printk("SLAB: cache with size %d has lost its name\n",
                                        pc->objsize);
                                continue;
                        }
                        if (!strcmp(pc->name, name)) {
                                printk("kmem_cache_create: duplicate cache %s\n", name);
                                up(&cache_chain_sem);
                                unlock_cpu_hotplug();
                                BUG();
                        }
                }
                set_fs(old_fs);
        }
        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
        up(&cache_chain_sem);
        unlock_cpu_hotplug();
opps:
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                        name);
        return cachep;
}