- In early_paging_init(), the memory re-initialization routine runs only on machines whose memory information changes. When LPAE is configured, additional routines for physical-to-virtual translation are also performed.
- Related machine: arch/arm/mach-keystone
<kernel v4.0>
Registers the following regions with the reserved memblock:
arch/arm/mm/init.c
void __init arm_memblock_init(const struct machine_desc *mdesc)
{
        /* Register the kernel text, kernel data and initrd with memblock. */
#ifdef CONFIG_XIP_KERNEL
        memblock_reserve(__pa(_sdata), _end - _sdata);
#else
        memblock_reserve(__pa(_stext), _end - _stext);
#endif
#ifdef CONFIG_BLK_DEV_INITRD
        /* FDT scan will populate initrd_start */
        if (initrd_start && !phys_initrd_size) {
                phys_initrd_start = __virt_to_phys(initrd_start);
                phys_initrd_size = initrd_end - initrd_start;
        }
        initrd_start = initrd_end = 0;
        if (phys_initrd_size &&
            !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
                pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region - disabling initrd\n",
                       (u64)phys_initrd_start, phys_initrd_size);
                phys_initrd_start = phys_initrd_size = 0;
        }
        if (phys_initrd_size &&
            memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
                pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region - disabling initrd\n",
                       (u64)phys_initrd_start, phys_initrd_size);
                phys_initrd_start = phys_initrd_size = 0;
        }
        if (phys_initrd_size) {
                memblock_reserve(phys_initrd_start, phys_initrd_size);

                /* Now convert initrd to virtual addresses */
                initrd_start = __phys_to_virt(phys_initrd_start);
                initrd_end = initrd_start + phys_initrd_size;
        }
#endif

        arm_mm_memblock_reserve();

        /* reserve any platform specific memblock areas */
        if (mdesc->reserve)
                mdesc->reserve();

        early_init_fdt_scan_reserved_mem();

        /* reserve memory for DMA contiguous allocations */
        dma_contiguous_reserve(arm_dma_limit);

        arm_memblock_steal_permitted = false;
        memblock_dump_all();
}
The function adds several regions to the reserved memblock in the order shown below.
arch/arm/mm/mmu.c
/*
 * Reserve the special regions of memory
 */
void __init arm_mm_memblock_reserve(void)
{
        /*
         * Reserve the page tables.  These are already in use,
         * and can only be in node 0.
         */
        memblock_reserve(__pa(swapper_pg_dir), SWAPPER_PG_DIR_SIZE);

#ifdef CONFIG_SA1111
        /*
         * Because of the SA1111 DMA bug, we want to preserve our
         * precious DMA-able memory...
         */
        memblock_reserve(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET);
#endif
}
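For reference, SWAPPER_PG_DIR_SIZE is defined in arch/arm/mm/mmu.c in terms of the pgd geometry. A quick sanity check of the arithmetic for a classic (non-LPAE) 2-level build — treat the definition below as a sketch recalled from the v4.0 sources:

#define SWAPPER_PG_DIR_SIZE     (PTRS_PER_PGD * sizeof(pgd_t))

/*
 * Non-LPAE 2-level layout: PTRS_PER_PGD = 2048 and a pgd_t folds two
 * hardware entries (8 bytes), so 2048 * 8 = 16 KiB (0x4000) is reserved
 * for swapper_pg_dir, which sits just below the kernel text.
 */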
Designates memory regions that must be carved out (reserved) of normal memory for specific purposes. Consider the example below.
/ {
        #address-cells = <1>;
        #size-cells = <1>;

        memory {
                reg = <0x40000000 0x40000000>;
        };

        reserved-memory {
                #address-cells = <1>;
                #size-cells = <1>;
                ranges;

                /* global autoconfigured region for contiguous allocations */
                linux,cma {
                        compatible = "shared-dma-pool";
                        reusable;
                        size = <0x4000000>;
                        alignment = <0x2000>;
                        linux,cma-default;
                };

                display_reserved: framebuffer@78000000 {
                        reg = <0x78000000 0x800000>;
                };

                multimedia_reserved: multimedia@77000000 {
                        compatible = "acme,multimedia-memory";
                        reg = <0x77000000 0x4000000>;
                };
        };

        /* ... */

        fb0: video@12300000 {
                memory-region = <&display_reserved>;
                /* ... */
        };

        scaler: scaler@12500000 {
                memory-region = <&multimedia_reserved>;
                /* ... */
        };

        codec: codec@12600000 {
                memory-region = <&multimedia_reserved>;
                /* ... */
        };
};
drivers/of/fdt.c
/**
 * early_init_fdt_scan_reserved_mem() - create reserved memory regions
 *
 * This function grabs memory from early allocator for device exclusive use
 * defined in device tree structures. It should be called by arch specific code
 * once the early allocator (i.e. memblock) has been fully activated.
 */
void __init early_init_fdt_scan_reserved_mem(void)
{
        int n;
        u64 base, size;

        if (!initial_boot_params)
                return;

        /* Reserve the dtb region */
        early_init_dt_reserve_memory_arch(__pa(initial_boot_params),
                                          fdt_totalsize(initial_boot_params),
                                          0);

        /* Process header /memreserve/ fields */
        for (n = 0; ; n++) {
                fdt_get_mem_rsv(initial_boot_params, n, &base, &size);
                if (!size)
                        break;
                early_init_dt_reserve_memory_arch(base, size, 0);
        }

        of_scan_flat_dt(__fdt_scan_reserved_mem, NULL);
        fdt_init_reserved_mem();
}
drivers/of/fdt.c
int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
                                        phys_addr_t size, bool nomap)
{
        if (nomap)
                return memblock_remove(base, size);
        return memblock_reserve(base, size);
}
drivers/of/fdt.c
/**
 * fdt_scan_reserved_mem() - scan a single FDT node for reserved memory
 */
static int __init __fdt_scan_reserved_mem(unsigned long node, const char *uname,
                                          int depth, void *data)
{
        static int found;
        const char *status;
        int err;

        if (!found && depth == 1 && strcmp(uname, "reserved-memory") == 0) {
                if (__reserved_mem_check_root(node) != 0) {
                        pr_err("Reserved memory: unsupported node format, ignoring\n");
                        /* break scan */
                        return 1;
                }
                found = 1;
                /* scan next node */
                return 0;
        } else if (!found) {
                /* scan next node */
                return 0;
        } else if (found && depth < 2) {
                /* scanning of /reserved-memory has been finished */
                return 1;
        }

        status = of_get_flat_dt_prop(node, "status", NULL);
        if (status && strcmp(status, "okay") != 0 && strcmp(status, "ok") != 0)
                return 0;

        err = __reserved_mem_reserve_reg(node, uname);
        if (err == -ENOENT && of_get_flat_dt_prop(node, "size", NULL))
                fdt_reserved_mem_save_node(node, uname, 0, 0);

        /* scan next node */
        return 0;
}
reserved-memory {
        #address-cells = <1>;
        #size-cells = <1>;
        ranges;

        vpp_reserved: vpp_mem@5e800000 {
                compatible = "sirf,reserved-memory";
                reg = <0x5e800000 0x800000>;
        };

        nanddisk_reserved: nanddisk@46000000 {
                reg = <0x46000000 0x200000>;
                no-map;
        };
};
This function operates only when the CONFIG_OF_EARLY_FLATTREE option is enabled (the default when using a DTB).
drivers/of/fdt.c
/**
 * res_mem_reserve_reg() - reserve all memory described in 'reg' property
 */
static int __init __reserved_mem_reserve_reg(unsigned long node,
                                             const char *uname)
{
        int t_len = (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32);
        phys_addr_t base, size;
        int len;
        const __be32 *prop;
        int nomap, first = 1;

        prop = of_get_flat_dt_prop(node, "reg", &len);
        if (!prop)
                return -ENOENT;

        if (len && len % t_len != 0) {
                pr_err("Reserved memory: invalid reg property in '%s', skipping node.\n",
                       uname);
                return -EINVAL;
        }

        nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;

        while (len >= t_len) {
                base = dt_mem_next_cell(dt_root_addr_cells, &prop);
                size = dt_mem_next_cell(dt_root_size_cells, &prop);

                if (size &&
                    early_init_dt_reserve_memory_arch(base, size, nomap) == 0)
                        pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %ld MiB\n",
                                uname, &base, (unsigned long)size / SZ_1M);
                else
                        pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %ld MiB\n",
                                uname, &base, (unsigned long)size / SZ_1M);

                len -= t_len;
                if (first) {
                        fdt_reserved_mem_save_node(node, uname, base, size);
                        first = 0;
                }
        }
        return 0;
}
Parses the DTB and registers each region with memblock in one of two ways: statically, at the address given by the reg property, or dynamically, by allocating a region described by the size property.
drivers/of/of_reserved_mem.c
/**
 * res_mem_save_node() - save fdt node for second pass initialization
 */
void __init fdt_reserved_mem_save_node(unsigned long node, const char *uname,
                                      phys_addr_t base, phys_addr_t size)
{
        struct reserved_mem *rmem = &reserved_mem[reserved_mem_count];

        if (reserved_mem_count == ARRAY_SIZE(reserved_mem)) {
                pr_err("Reserved memory: not enough space all defined regions.\n");
                return;
        }

        rmem->fdt_node = node;
        rmem->name = uname;
        rmem->base = base;
        rmem->size = size;

        reserved_mem_count++;
        return;
}
drivers/of/of_reserved_mem.c
/**
 * fdt_init_reserved_mem - allocate and init all saved reserved memory regions
 */
void __init fdt_init_reserved_mem(void)
{
        int i;
        for (i = 0; i < reserved_mem_count; i++) {
                struct reserved_mem *rmem = &reserved_mem[i];
                unsigned long node = rmem->fdt_node;
                int len;
                const __be32 *prop;
                int err = 0;

                prop = of_get_flat_dt_prop(node, "phandle", &len);
                if (!prop)
                        prop = of_get_flat_dt_prop(node, "linux,phandle", &len);
                if (prop)
                        rmem->phandle = of_read_number(prop, len/4);

                if (rmem->size == 0)
                        err = __reserved_mem_alloc_size(node, rmem->name,
                                                 &rmem->base, &rmem->size);
                if (err == 0)
                        __reserved_mem_init_node(rmem);
        }
}
Reads the reserved_mem[] entries registered by the dynamic method, looks within the memory range requested by the DTB alloc-ranges property for a region that can be reserved to the size given by the size property, and on success adds it to the reserved memblock. It then calls the callback registered for each device driver (of_device_id->data) to initialize that driver.
drivers/of/of_reserved_mem.c
/**
 * res_mem_alloc_size() - allocate reserved memory described by 'size', 'align'
 *                        and 'alloc-ranges' properties
 */
static int __init __reserved_mem_alloc_size(unsigned long node,
        const char *uname, phys_addr_t *res_base, phys_addr_t *res_size)
{
        int t_len = (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32);
        phys_addr_t start = 0, end = 0;
        phys_addr_t base = 0, align = 0, size;
        int len;
        const __be32 *prop;
        int nomap;
        int ret;

        prop = of_get_flat_dt_prop(node, "size", &len);
        if (!prop)
                return -EINVAL;

        if (len != dt_root_size_cells * sizeof(__be32)) {
                pr_err("Reserved memory: invalid size property in '%s' node.\n",
                                uname);
                return -EINVAL;
        }
        size = dt_mem_next_cell(dt_root_size_cells, &prop);

        nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;

        prop = of_get_flat_dt_prop(node, "alignment", &len);
        if (prop) {
                if (len != dt_root_addr_cells * sizeof(__be32)) {
                        pr_err("Reserved memory: invalid alignment property in '%s' node.\n",
                                        uname);
                        return -EINVAL;
                }
                align = dt_mem_next_cell(dt_root_addr_cells, &prop);
        }
Using the dynamic method, adds a region of the requested size to the reserved memblock within the range given by the alloc-ranges property.
        prop = of_get_flat_dt_prop(node, "alloc-ranges", &len);
        if (prop) {

                if (len % t_len != 0) {
                        pr_err("Reserved memory: invalid alloc-ranges property in '%s', skipping node.\n",
                               uname);
                        return -EINVAL;
                }

                base = 0;

                while (len > 0) {
                        start = dt_mem_next_cell(dt_root_addr_cells, &prop);
                        end = start + dt_mem_next_cell(dt_root_size_cells,
                                                       &prop);

                        ret = early_init_dt_alloc_reserved_memory_arch(size,
                                        align, start, end, nomap, &base);
                        if (ret == 0) {
                                pr_debug("Reserved memory: allocated memory for '%s' node: base %pa, size %ld MiB\n",
                                        uname, &base,
                                        (unsigned long)size / SZ_1M);
                                break;
                        }
                        len -= t_len;
                }

        } else {
                ret = early_init_dt_alloc_reserved_memory_arch(size, align,
                                                        0, 0, nomap, &base);
                if (ret == 0)
                        pr_debug("Reserved memory: allocated memory for '%s' node: base %pa, size %ld MiB\n",
                                uname, &base, (unsigned long)size / SZ_1M);
        }

        if (base == 0) {
                pr_info("Reserved memory: failed to allocate memory for node '%s'\n",
                        uname);
                return -ENOMEM;
        }

        *res_base = base;
        *res_size = size;

        return 0;
}
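For reference, early_init_dt_alloc_reserved_memory_arch() called above is a memblock-backed helper in drivers/of/of_reserved_mem.c. The following is a sketch of its v4.0-era behavior reconstructed from memory, so treat the exact body as an approximation:

/* Sketch: allocate top-down below 'end', verify the result lands inside
 * the start..end window, and honor no-map by removing the region. */
static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
        phys_addr_t align, phys_addr_t start, phys_addr_t end, bool nomap,
        phys_addr_t *res_base)
{
        phys_addr_t base;

        /* memblock_alloc_base() would panic() on failure, so use the
         * non-panicking variant and report -ENOMEM instead. */
        base = __memblock_alloc_base(size, align, end);
        if (!base)
                return -ENOMEM;

        /* check that the allocation fell inside the requested window */
        if (base < start) {
                memblock_free(base, size);
                return -ENOMEM;
        }

        *res_base = base;
        if (nomap)
                return memblock_remove(base, size);
        return 0;
}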
drivers/of/of_reserved_mem.c
/**
 * res_mem_init_node() - call region specific reserved memory init code
 */
static int __init __reserved_mem_init_node(struct reserved_mem *rmem)
{
        extern const struct of_device_id __reservedmem_of_table[];
        const struct of_device_id *i;

        for (i = __reservedmem_of_table; i < &__rmem_of_table_sentinel; i++) {
                reservedmem_of_init_fn initfn = i->data;
                const char *compat = i->compatible;

                if (!of_flat_dt_is_compatible(rmem->fdt_node, compat))
                        continue;

                if (initfn(rmem) == 0) {
                        pr_info("Reserved memory: initialized node %s, compatible id %s\n",
                                rmem->name, compat);
                        return 0;
                }
        }
        return -ENOENT;
}
Takes the compatible string (compat) used by each sub-node of the DTB reserved-memory node, searches __reservedmem_of_table built into the kernel, and calls the matching initialization function.
The current kernel provides the following two device driver implementations.
Used as the device driver for CMA in the DMA mapping framework.
drivers/base/dma-contiguous.c
static const struct reserved_mem_ops rmem_cma_ops = {
        .device_init    = rmem_cma_device_init,
        .device_release = rmem_cma_device_release,
};

static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
        phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
        phys_addr_t mask = align - 1;
        unsigned long node = rmem->fdt_node;
        struct cma *cma;
        int err;

        if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
            of_get_flat_dt_prop(node, "no-map", NULL))
                return -EINVAL;

        if ((rmem->base & mask) || (rmem->size & mask)) {
                pr_err("Reserved memory: incorrect alignment of CMA region\n");
                return -EINVAL;
        }

        err = cma_init_reserved_mem(rmem->base, rmem->size, 0, &cma);
        if (err) {
                pr_err("Reserved memory: unable to setup CMA region\n");
                return err;
        }
        /* Architecture specific contiguous memory fixup. */
        dma_contiguous_early_fixup(rmem->base, rmem->size);

        if (of_get_flat_dt_prop(node, "linux,cma-default", NULL))
                dma_contiguous_set_default(cma);

        rmem->ops = &rmem_cma_ops;
        rmem->priv = cma;

        pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
                &rmem->base, (unsigned long)rmem->size / SZ_1M);

        return 0;
}
RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
Via RESERVEDMEM_OF_DECLARE, an of_device_id structure named __of_table_cma is registered in __reservedmem_of_table. Note that both drivers declare the same "shared-dma-pool" compatible; they are distinguished by the reusable property, which CMA requires and the coherent DMA driver rejects.
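The macro itself lives in include/linux/of_reserved_mem.h and builds on _OF_DECLARE() from include/linux/of.h. Its expansion in the v4.0-era headers is roughly the following (a sketch; the ternary on .data is a compile-time type check of the init function):

/* include/linux/of_reserved_mem.h (sketch) */
typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem);

#define RESERVEDMEM_OF_DECLARE(name, compat, init) \
        _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn)

/* include/linux/of.h: places an of_device_id named __of_table_<name>
 * into the __reservedmem_of_table section, which is terminated by the
 * __rmem_of_table_sentinel shown further below. */
#define _OF_DECLARE(table, name, compat, fn, fn_type)                   \
        static const struct of_device_id __of_table_##name              \
                __used __section(__##table##_of_table)                  \
                 = { .compatible = compat,                              \
                     .data = (fn == (fn_type)NULL) ? fn : fn }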
Used as the device driver for coherent per-device DMA memory handling.
drivers/base/dma-coherent.c
static const struct reserved_mem_ops rmem_dma_ops = {
        .device_init    = rmem_dma_device_init,
        .device_release = rmem_dma_device_release,
};

static int __init rmem_dma_setup(struct reserved_mem *rmem)
{
        unsigned long node = rmem->fdt_node;

        if (of_get_flat_dt_prop(node, "reusable", NULL))
                return -EINVAL;

#ifdef CONFIG_ARM
        if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
                pr_err("Reserved memory: regions without no-map are not yet supported\n");
                return -EINVAL;
        }
#endif

        rmem->ops = &rmem_dma_ops;
        pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n",
                &rmem->base, (unsigned long)rmem->size / SZ_1M);
        return 0;
}
RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
include/linux/of_reserved_mem.h
struct reserved_mem {
        const char                      *name;
        unsigned long                   fdt_node;
        unsigned long                   phandle;
        const struct reserved_mem_ops   *ops;
        phys_addr_t                     base;
        phys_addr_t                     size;
        void                            *priv;
};
include/linux/of_reserved_mem.h
struct reserved_mem_ops {
        int     (*device_init)(struct reserved_mem *rmem,
                               struct device *dev);
        void    (*device_release)(struct reserved_mem *rmem,
                                  struct device *dev);
};
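These ops are exercised on behalf of consuming devices: when a driver whose DT node carries a memory-region phandle calls of_reserved_mem_device_init(), the matching region's device_init() runs for that device. Below is a minimal sketch of such a consumer; fb0_probe/fb0_remove are hypothetical names, and only the two of_reserved_mem_device_*() calls are the real API:

#include <linux/platform_device.h>
#include <linux/of_reserved_mem.h>

/* Hypothetical consumer of the display_reserved region shown earlier. */
static int fb0_probe(struct platform_device *pdev)
{
        int ret;

        /* Resolves this device's "memory-region" phandle and calls the
         * region's reserved_mem_ops->device_init() for &pdev->dev. */
        ret = of_reserved_mem_device_init(&pdev->dev);
        if (ret)
                return ret;

        /* DMA allocations for this device can now be served from the
         * attached reserved region (per-device coherent pool or CMA). */
        return 0;
}

static int fb0_remove(struct platform_device *pdev)
{
        /* Detach the region via reserved_mem_ops->device_release(). */
        of_reserved_mem_device_release(&pdev->dev);
        return 0;
}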
drivers/of/of_reserved_mem.c
static const struct of_device_id __rmem_of_table_sentinel
        __used __section(__reservedmem_of_table_end);
#define MAX_RESERVED_REGIONS    16
static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS];
static int reserved_mem_count;
This function sanity-checks the registered memory-type memblock regions in advance, preparing the following so that they can serve the early memory allocator.
arch/arm/mm/mmu.c
void __init sanity_check_meminfo(void)
{
        phys_addr_t memblock_limit = 0;
        int highmem = 0;
        phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
        struct memblock_region *reg;

        for_each_memblock(memory, reg) {
                phys_addr_t block_start = reg->base;
                phys_addr_t block_end = reg->base + reg->size;
                phys_addr_t size_limit = reg->size;

                if (reg->base >= vmalloc_limit)
                        highmem = 1;
                else
                        size_limit = vmalloc_limit - reg->base;

                if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {

                        if (highmem) {
                                pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
                                          &block_start, &block_end);
                                memblock_remove(reg->base, reg->size);
                                continue;
                        }

                        if (reg->size > size_limit) {
                                phys_addr_t overlap_size = reg->size - size_limit;

                                pr_notice("Truncating RAM at %pa-%pa to -%pa",
                                          &block_start, &block_end, &vmalloc_limit);
                                memblock_remove(vmalloc_limit, overlap_size);
                                block_end = vmalloc_limit;
                        }
                }
When HIGHMEM is not enabled, memblock regions lying entirely above vmalloc_limit are removed, and regions that straddle it are truncated at vmalloc_limit.
                if (!highmem) {
                        if (block_end > arm_lowmem_limit) {
                                if (reg->size > size_limit)
                                        arm_lowmem_limit = vmalloc_limit;
                                else
                                        arm_lowmem_limit = block_end;
                        }
                        /*
                         * Find the first non-pmd-aligned page, and point
                         * memblock_limit at it. This relies on rounding the
                         * limit down to be pmd-aligned, which happens at the
                         * end of this function.
                         *
                         * With this algorithm, the start or end of almost any
                         * bank can be non-pmd-aligned. The only exception is
                         * that the start of the bank 0 must be section-
                         * aligned, since otherwise memory would need to be
                         * allocated when mapping the start of bank 0, which
                         * occurs before any free memory is mapped.
                         */
                        if (!memblock_limit) {
                                if (!IS_ALIGNED(block_start, PMD_SIZE))
                                        memblock_limit = block_start;
                                else if (!IS_ALIGNED(block_end, PMD_SIZE))
                                        memblock_limit = arm_lowmem_limit;
                        }
                }
        }

        high_memory = __va(arm_lowmem_limit - 1) + 1;

        /*
         * Round the memblock limit down to a pmd size.  This
         * helps to ensure that we will allocate memory from the
         * last full pmd, which should be mapped.
         */
        if (memblock_limit)
                memblock_limit = round_down(memblock_limit, PMD_SIZE);
        if (!memblock_limit)
                memblock_limit = arm_lowmem_limit;

        memblock_set_current_limit(memblock_limit);
}
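A worked example of the limit selection (hypothetical bank layout; PMD_SIZE is 2 MiB on ARM in both 2-level and LPAE configurations):

/*
 * Hypothetical layout, PMD_SIZE = 2 MiB:
 *
 *   bank 0: base 0x40000000, size 0x2ff00000
 *     block_start 0x40000000  -> PMD-aligned, no limit set here
 *     block_end   0x6ff00000  -> not PMD-aligned,
 *                                so memblock_limit = arm_lowmem_limit
 *
 * At the end of the function memblock_limit is rounded down to a
 * 2 MiB boundary, so early memblock allocations are confined to
 * lowmem that is guaranteed to be covered by a full, mapped PMD.
 */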
Performs initialization to account for the machine's changed memory information. In the LPAE case, physical-to-virtual translation is required, so several additional routines must run.
#ifdef CONFIG_ARM_LPAE
/*
 * early_paging_init() recreates boot time page table setup, allowing machines
 * to switch over to a high (>4G) address space on LPAE systems
 */
void __init early_paging_init(const struct machine_desc *mdesc,
                              struct proc_info_list *procinfo)
{
        pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
        unsigned long map_start, map_end;
        pgd_t *pgd0, *pgdk;
        pud_t *pud0, *pudk, *pud_start;
        pmd_t *pmd0, *pmdk;
        phys_addr_t phys;
        int i;

        if (!(mdesc->init_meminfo))
                return;

        /* remap kernel code and data */
        map_start = init_mm.start_code & PMD_MASK;
        map_end = ALIGN(init_mm.brk, PMD_SIZE);

        /* get a handle on things... */
        pgd0 = pgd_offset_k(0);
        pud_start = pud0 = pud_offset(pgd0, 0);
        pmd0 = pmd_offset(pud0, 0);

        pgdk = pgd_offset_k(map_start);
        pudk = pud_offset(pgdk, map_start);
        pmdk = pmd_offset(pudk, map_start);

        mdesc->init_meminfo();

        /* Run the patch stub to update the constants */
        fixup_pv_table(&__pv_table_begin,
                (&__pv_table_end - &__pv_table_begin) << 2);

        /*
         * Cache cleaning operations for self-modifying code
         * We should clean the entries by MVA but running a
         * for loop over every pv_table entry pointer would
         * just complicate the code.
         */
        flush_cache_louis();
        dsb(ishst);
        isb();

        /*
         * FIXME: This code is not architecturally compliant: we modify
         * the mappings in-place, indeed while they are in use by this
         * very same code. This may lead to unpredictable behaviour of
         * the CPU.
         *
         * Even modifying the mappings in a separate page table does
         * not resolve this.
         *
         * The architecture strongly recommends that when a mapping is
         * changed, that it is changed by first going via an invalid
         * mapping and back to the new mapping. This is to ensure that
         * no TLB conflicts (caused by the TLB having more than one TLB
         * entry match a translation) can occur. However, doing that
         * here will result in unmapping the code we are running.
         */
        pr_warn("WARNING: unsafe modification of in-place page tables - tainting kernel\n");
        add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);

        /*
         * Remap level 1 table. This changes the physical addresses
         * used to refer to the level 2 page tables to the high
         * physical address alias, leaving everything else the same.
         */
        for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
                set_pud(pud0, __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
                pmd0 += PTRS_PER_PMD;
        }

        /*
         * Remap the level 2 table, pointing the mappings at the high
         * physical address alias of these pages.
         */
        phys = __pa(map_start);
        do {
                *pmdk++ = __pmd(phys | pmdprot);
                phys += PMD_SIZE;
        } while (phys < map_end);

        /*
         * Ensure that the above updates are flushed out of the cache.
         * This is not strictly correct; on a system where the caches
         * are coherent with each other, but the MMU page table walks
         * may not be coherent, flush_cache_all() may be a no-op, and
         * this will fail.
         */
        flush_cache_all();

        /*
         * Re-write the TTBR values to point them at the high physical
         * alias of the page tables. We expect __va() will work on
         * cpu_get_pgd(), which returns the value of TTBR0.
         */
        cpu_switch_mm(pgd0, &init_mm);
        cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);

        /* Finally flush any stale TLB values. */
        local_flush_bp_all();
        local_flush_tlb_all();
}

#else

void __init early_paging_init(const struct machine_desc *mdesc,
                              struct proc_info_list *procinfo)
{
        if (mdesc->init_meminfo)
                mdesc->init_meminfo();
}

#endif
arch/arm/include/asm/proc-fns.h
#define cpu_switch_mm(pgd,mm) cpu_do_switch_mm(virt_to_phys(pgd),mm)
arch/arm/include/asm/glue-proc.h
#define cpu_do_switch_mm __glue(CPU_NAME,_switch_mm)
arch/arm/include/asm/proc-fns.h
#define cpu_do_switch_mm processor.switch_mm
arch/arm/mm/proc-v7-2level.S
/*
 *      cpu_v7_switch_mm(pgd_phys, tsk)
 *
 *      Set the translation table base pointer to be pgd_phys
 *
 *      - pgd_phys - physical address of new TTB
 *
 *      It is assumed that:
 *      - we are not using split page tables
 */
ENTRY(cpu_v7_switch_mm)
#ifdef CONFIG_MMU
        mov     r2, #0
        mmid    r1, r1                          @ get mm->context.id
        ALT_SMP(orr     r0, r0, #TTB_FLAGS_SMP)
        ALT_UP(orr      r0, r0, #TTB_FLAGS_UP)
#ifdef CONFIG_ARM_ERRATA_430973
        mcr     p15, 0, r2, c7, c5, 6           @ flush BTAC/BTB
#endif
#ifdef CONFIG_PID_IN_CONTEXTIDR
        mrc     p15, 0, r2, c13, c0, 1          @ read current context ID
        lsr     r2, r2, #8                      @ extract the PID
        bfi     r1, r2, #8, #24                 @ insert into new context ID
#endif
#ifdef CONFIG_ARM_ERRATA_754322
        dsb
#endif
        mcr     p15, 0, r1, c13, c0, 1          @ set context ID
        isb
        mcr     p15, 0, r0, c2, c0, 0           @ set TTB 0
        isb
#endif
        bx      lr
ENDPROC(cpu_v7_switch_mm)
arch/arm/include/asm/glue.h
#define ____glue(name,fn)       name##fn
#define __glue(name,fn)         ____glue(name,fn)
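Putting the glue macros together, the cpu_switch_mm() call in early_paging_init() resolves as follows on a single-processor ARMv7 build, where glue-proc.h defines CPU_NAME as cpu_v7 (a sketch; MULTI_CPU builds instead dispatch through processor.switch_mm at run time):

/*
 * Expansion sketch for a single-CPU (!MULTI_CPU) ARMv7 build:
 *
 *   cpu_switch_mm(pgd0, &init_mm)
 *     -> cpu_do_switch_mm(virt_to_phys(pgd0), &init_mm)   (proc-fns.h)
 *     -> __glue(cpu_v7, _switch_mm)(...)                  (glue-proc.h)
 *     -> cpu_v7_switch_mm(virt_to_phys(pgd0), &init_mm)   (glue.h)
 *
 * cpu_v7_switch_mm then writes the new pgd physical address into TTBR0
 * and the context ID into CONTEXTIDR, as shown in the assembly above.
 */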