diff --git a/Makefile b/Makefile
index 314e74f450..3c5a2077a0 100644
--- a/Makefile
+++ b/Makefile
@@ -312,7 +312,7 @@ gcc-sysroot = $(if $(CROSS_PREFIX), --sysroot external/$(arch)/gcc.bin) \
 # To add something that will *not* be part of the main kernel, you can do:
 #
 #   mydir/*.o EXTRA_FLAGS = 
-EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_BASE=$(kernel_base) -DOSV_LZKERNEL_BASE=$(lzkernel_base)
+EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_BASE=$(kernel_base) -DOSV_KERNEL_VM_SHIFT=$(kernel_vm_shift) -DOSV_LZKERNEL_BASE=$(lzkernel_base)
 EXTRA_LIBS =
 COMMON = $(autodepend) -g -Wall -Wno-pointer-arith $(CFLAGS_WERROR) -Wformat=0 -Wno-format-security \
 	-D __BSD_VISIBLE=1 -U _FORTIFY_SOURCE -fno-stack-protector $(INCLUDES) \
@@ -421,6 +421,7 @@ ifeq ($(arch),x64)
 # lzkernel_base is where the compressed kernel is loaded from disk.
 kernel_base := 0x200000
 lzkernel_base := 0x100000
+kernel_vm_base := 0x40200000
 
 $(out)/arch/x64/boot16.o: $(out)/lzloader.elf
 $(out)/boot.bin: arch/x64/boot16.ld $(out)/arch/x64/boot16.o
@@ -480,6 +481,7 @@ endif # x64
 
 ifeq ($(arch),aarch64)
 kernel_base := 0x40080000
+kernel_vm_base := 0x40080000
 
 include $(libfdt_base)/Makefile.libfdt
 libfdt-source := $(patsubst %.c, $(libfdt_base)/%.c, $(LIBFDT_SRCS))
@@ -500,6 +502,8 @@ $(out)/loader.img: $(out)/preboot.bin $(out)/loader-stripped.elf
 
 endif # aarch64
 
+kernel_vm_shift := $(shell printf "0x%X" $(shell expr $$(( $(kernel_vm_base) - $(kernel_base) )) ))
+
 $(out)/bsd/sys/crypto/rijndael/rijndael-api-fst.o: COMMON+=-fno-strict-aliasing
 $(out)/bsd/sys/crypto/sha2/sha2.o: COMMON+=-fno-strict-aliasing
 $(out)/bsd/sys/net/route.o: COMMON+=-fno-strict-aliasing
@@ -1873,6 +1877,7 @@ stage1: $(stage1_targets) links
 $(out)/loader.elf: $(stage1_targets) arch/$(arch)/loader.ld $(out)/bootfs.o
 	$(call quiet, $(LD) -o $@ --defsym=OSV_KERNEL_BASE=$(kernel_base) \
+	    --defsym=OSV_KERNEL_VM_BASE=$(kernel_vm_base) --defsym=OSV_KERNEL_VM_SHIFT=$(kernel_vm_shift) \
 	    -Bdynamic --export-dynamic --eh-frame-hdr --enable-new-dtags \
 	    $(^:%.ld=-T %.ld) \
 	    --whole-archive \
diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
index 62236486d3..210f2d7eff 100644
--- a/arch/x64/arch-setup.cc
+++ b/arch/x64/arch-setup.cc
@@ -85,12 +85,15 @@ extern boot_time_chart boot_time;
 // it by placing address of start32 at the known offset at memory
 // as defined by section .start32_address in loader.ld
 extern "C" void start32();
-void * __attribute__((section (".start32_address"))) start32_address = reinterpret_cast<void*>(&start32);
+void * __attribute__((section (".start32_address"))) start32_address =
+    reinterpret_cast<void*>((long)&start32 - OSV_KERNEL_VM_SHIFT);
 
 void arch_setup_free_memory()
 {
-    static ulong edata;
+    static ulong edata, edata_phys;
     asm ("movl $.edata, %0" : "=rm"(edata));
+    edata_phys = edata - OSV_KERNEL_VM_SHIFT;
+
     // copy to stack so we don't free it now
     auto omb = *osv_multiboot_info;
     auto mb = omb.mb;
@@ -129,13 +132,13 @@ void arch_setup_free_memory()
     // page tables have been set up, so we can't reference the memory being
     // freed.
     for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
-        // can't free anything below edata, it's core code.
+        // can't free anything below edata_phys, it's core code.
         // can't free anything below kernel at this moment
-        if (ent.addr + ent.size <= edata) {
+        if (ent.addr + ent.size <= edata_phys) {
             return;
         }
-        if (intersects(ent, edata)) {
-            ent = truncate_below(ent, edata);
+        if (intersects(ent, edata_phys)) {
+            ent = truncate_below(ent, edata_phys);
         }
         // ignore anything above 1GB, we haven't mapped it yet
         if (intersects(ent, initial_map)) {
@@ -149,21 +152,27 @@ void arch_setup_free_memory()
         auto base = reinterpret_cast<void*>(get_mem_area_base(area));
         mmu::linear_map(base, 0, initial_map, initial_map);
     }
-    // map the core, loaded 1:1 by the boot loader
-    mmu::phys elf_phys = reinterpret_cast<mmu::phys>(elf_header);
-    elf_start = reinterpret_cast<void*>(elf_header);
-    elf_size = edata - elf_phys;
-    mmu::linear_map(elf_start, elf_phys, elf_size, OSV_KERNEL_BASE);
+    // Map the core, loaded by the boot loader.
+    // In order to properly set up the mapping between virtual
+    // and physical memory, we need to take into account where the kernel
+    // is loaded in physical memory - elf_phys_start - and
+    // where it is linked to start in virtual memory - elf_start.
+    static mmu::phys elf_phys_start = reinterpret_cast<mmu::phys>(elf_header);
+    // There is a simple invariant between elf_phys_start and elf_start,
+    // as expressed by the assignment below.
+    elf_start = reinterpret_cast<void*>(elf_phys_start + OSV_KERNEL_VM_SHIFT);
+    elf_size = edata_phys - elf_phys_start;
+    mmu::linear_map(elf_start, elf_phys_start, elf_size, OSV_KERNEL_BASE);
     // get rid of the command line, before low memory is unmapped
     parse_cmdline(mb);
     // now that we have some free memory, we can start mapping the rest
     mmu::switch_to_runtime_page_tables();
     for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
         //
-        // Free the memory below elf_start which we could not before
-        if (ent.addr < (u64)elf_start) {
-            if (ent.addr + ent.size >= (u64)elf_start) {
-                ent = truncate_above(ent, (u64) elf_start);
+        // Free the memory below elf_phys_start which we could not before
+        if (ent.addr < (u64)elf_phys_start) {
+            if (ent.addr + ent.size >= (u64)elf_phys_start) {
+                ent = truncate_above(ent, (u64) elf_phys_start);
             }
             mmu::free_initial_memory_range(ent.addr, ent.size);
             return;
diff --git a/arch/x64/boot.S b/arch/x64/boot.S
index 1402e5d0e4..bb157bff9e 100644
--- a/arch/x64/boot.S
+++ b/arch/x64/boot.S
@@ -24,13 +24,28 @@
 .align 4096
 .global ident_pt_l4
 ident_pt_l4:
-    .quad ident_pt_l3 + 0x67
+    # The addresses of the paging tables have to be the physical ones, so we have to
+    # manually subtract OSV_KERNEL_VM_SHIFT in all relevant places
+    .quad ident_pt_l3 + 0x67 - OSV_KERNEL_VM_SHIFT
    .rept 511
    .quad 0
    .endr
+#if OSV_KERNEL_VM_SHIFT != 0x40000000 && OSV_KERNEL_VM_SHIFT != 0
+#error This code only works correctly for OSV_KERNEL_VM_SHIFT = 0x40000000 or 0
+#endif
 ident_pt_l3:
-    .quad ident_pt_l2 + 0x67
-    .rept 511
+    # The 512 entries of this table together map the first 512 GiB of
+    # virtual address space, 1 GiB at a time.
+    # The 1st entry maps the 1st GiB 1:1 by pointing to the ident_pt_l2 table,
+    # which maps each of its 512 2 MiB slots directly to physical memory.
+    .quad ident_pt_l2 + 0x67 - OSV_KERNEL_VM_SHIFT
+    # The 2nd entry maps the 2nd GiB to the same 1st GiB of physical memory by
+    # pointing to the same ident_pt_l2 table as the 1st entry above.
+    # This way we effectively provide a correct mapping for the kernel, which is
+    # linked to start at 1 GiB + 2 MiB (0x40200000) in virtual memory and
+    # starts at the 2 MiB address (0x200000) in physical memory.
+    .quad ident_pt_l2 + 0x67 - OSV_KERNEL_VM_SHIFT
+    .rept 510
    .quad 0
    .endr
 ident_pt_l2:
@@ -42,7 +57,8 @@ ident_pt_l2:
 
 gdt_desc:
     .short gdt_end - gdt - 1
-    .long gdt
+    # subtract OSV_KERNEL_VM_SHIFT because when gdt_desc is referenced, the memory is mapped 1:1
+    .long gdt - OSV_KERNEL_VM_SHIFT
 
 # Set up the 64-bit compatible version of GDT description structure
 # that points to the same GDT (Global segments Descriptors Table) and
@@ -53,7 +69,8 @@ gdt_desc:
 .align 8
 gdt64_desc:
     .short gdt_end - gdt - 1
-    .quad gdt
+    # subtract OSV_KERNEL_VM_SHIFT because when gdt64_desc is referenced, the memory is mapped 1:1
+    .quad gdt - OSV_KERNEL_VM_SHIFT
 
 .align 8
 gdt = . - 8
@@ -77,10 +94,12 @@ init_stack_top = .
 .globl start32
 .globl start32_from_64
 start32:
+    # Because the memory is mapped 1:1 at this point, we have to manually
+    # subtract OSV_KERNEL_VM_SHIFT from virtual addresses in all relevant places
     # boot16.S set %eax to ELF start address, we'll use it later
     mov %eax, %ebp
     mov $0x0, %edi
-    lgdt gdt_desc
+    lgdt gdt_desc-OSV_KERNEL_VM_SHIFT
 
 # Add an address the vmlinux_entry64 will jump to when
 # switching from 64-bit to 32-bit mode
@@ -91,7 +110,7 @@ start32_from_64:
     mov %eax, %fs
     mov %eax, %gs
     mov %eax, %ss
-    ljmp $0x18, $1f
+    ljmp $0x18, $1f-OSV_KERNEL_VM_SHIFT
 1:
     and $~7, %esp
 
     # Enable PAE (Physical Address Extension) - ability to address 64GB
@@ -101,6 +120,9 @@ start32_from_64:
     # Set root of a page table in cr3
     lea ident_pt_l4, %eax
+    # The address of the root paging table has to be physical,
+    # so subtract OSV_KERNEL_VM_SHIFT from ident_pt_l4
+    sub $OSV_KERNEL_VM_SHIFT, %eax
     mov %eax, %cr3
 
     # Set long mode
@@ -128,7 +150,7 @@ start64:
     jz start64_continue
     call extract_linux_boot_params
     mov $0x1000, %rbx
-    mov $0x200000, %rbp
+    mov $OSV_KERNEL_BASE, %rbp
 
 start64_continue:
     lea .bss, %rdi
@@ -168,6 +190,7 @@ smpboot:
     mov smpboot_cr4-smpboot, %eax
     mov %eax, %cr4
     lea ident_pt_l4, %eax
+    sub $OSV_KERNEL_VM_SHIFT, %eax
     mov %eax, %cr3
     mov smpboot_efer-smpboot, %eax
     mov smpboot_efer+4-smpboot, %edx
@@ -181,7 +204,7 @@ smpboot:
 
 smpboot_gdt_desc:
     .short gdt_end - gdt - 1
-    .long gdt
+    .long gdt - OSV_KERNEL_VM_SHIFT
 
 .global smpboot_cr0
 smpboot_cr0:
     .long 0
diff --git a/arch/x64/entry-xen.S b/arch/x64/entry-xen.S
index 11f72da4cd..813422846b 100644
--- a/arch/x64/entry-xen.S
+++ b/arch/x64/entry-xen.S
@@ -23,7 +23,7 @@
 elfnote_val(XEN_ELFNOTE_ENTRY, xen_start)
 elfnote_val(XEN_ELFNOTE_HYPERCALL_PAGE, hypercall_page)
-elfnote_val(XEN_ELFNOTE_VIRT_BASE, 0)
+elfnote_val(XEN_ELFNOTE_VIRT_BASE, OSV_KERNEL_VM_SHIFT)
 elfnote_str(XEN_ELFNOTE_XEN_VERSION, "xen-3.0")
 elfnote_str(XEN_ELFNOTE_GUEST_OS, "osv")
 elfnote_str(XEN_ELFNOTE_GUEST_VERSION, "?.?")
@@ -50,4 +50,5 @@ xen_start:
     mov %rsp, xen_bootstrap_end
     mov %rsi, %rdi
     call xen_init
+    mov $0x0, %rdi
     jmp start64
diff --git a/arch/x64/loader.ld b/arch/x64/loader.ld
index caae1f68d6..8b82b1bb06 100644
--- a/arch/x64/loader.ld
+++ b/arch/x64/loader.ld
@@ -14,16 +14,17 @@ SECTIONS
      *
      * We can't export the ELF header base as a symbol, because ld
      * insists on moving stuff around if we do.
-     *
+     */
+    . = OSV_KERNEL_VM_BASE + 0x800;
+    /*
      * Place address of start32 routine at predefined offset in memory
      */
-    . = OSV_KERNEL_BASE + 0x800;
-    .start32_address : {
+    .start32_address : AT(ADDR(.start32_address) - OSV_KERNEL_VM_SHIFT) {
         *(.start32_address)
     }
-    . = OSV_KERNEL_BASE + 0x1000;
-    .dynamic : { *(.dynamic) } :dynamic :text
-    .text : {
+    . = OSV_KERNEL_VM_BASE + 0x1000;
+    .dynamic : AT(ADDR(.dynamic) - OSV_KERNEL_VM_SHIFT) { *(.dynamic) } :dynamic :text
+    .text : AT(ADDR(.text) - OSV_KERNEL_VM_SHIFT) {
         text_start = .;
         *(.text.hot .text.hot.*)
         *(.text.unlikely .text.*_unlikely)
@@ -31,60 +32,61 @@ SECTIONS
         *(.text.startup .text.startup.*)
         *(.text .text.*)
         text_end = .;
+        PROVIDE(low_vmlinux_entry64 = vmlinux_entry64 - OSV_KERNEL_VM_SHIFT);
     } :text
     . = ALIGN(8);
-    .fixup : {
+    .fixup : AT(ADDR(.fixup) - OSV_KERNEL_VM_SHIFT) {
         fault_fixup_start = .;
         *(.fixup)
         fault_fixup_end = .;
     } :text
     . = ALIGN(8);
-    .memcpy_decode : {
+    .memcpy_decode : AT(ADDR(.memcpy_decode) - OSV_KERNEL_VM_SHIFT) {
         memcpy_decode_start = .;
         *(.memcpy_decode)
         memcpy_decode_end = .;
     } :text
-    .eh_frame : { *(.eh_frame) } : text
-    .rodata : { *(.rodata*) } :text
-    .eh_frame : { *(.eh_frame) } :text
-    .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame
-    .note : { *(.note*) } :text :note
-    .gcc_except_table : { *(.gcc_except_table) *(.gcc_except_table.*) } : text
-    .tracepoint_patch_sites ALIGN(8) : {
+    .eh_frame : AT(ADDR(.eh_frame) - OSV_KERNEL_VM_SHIFT) { *(.eh_frame) } : text
+    .rodata : AT(ADDR(.rodata) - OSV_KERNEL_VM_SHIFT) { *(.rodata*) } :text
+    .eh_frame : AT(ADDR(.eh_frame) - OSV_KERNEL_VM_SHIFT) { *(.eh_frame) } :text
+    .eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - OSV_KERNEL_VM_SHIFT) { *(.eh_frame_hdr) } :text :eh_frame
+    .note : AT(ADDR(.note) - OSV_KERNEL_VM_SHIFT) { *(.note*) } :text :note
+    .gcc_except_table : AT(ADDR(.gcc_except_table) - OSV_KERNEL_VM_SHIFT) { *(.gcc_except_table) *(.gcc_except_table.*) } : text
+    .tracepoint_patch_sites ALIGN(8) : AT(ADDR(.tracepoint_patch_sites) - OSV_KERNEL_VM_SHIFT) {
         __tracepoint_patch_sites_start = .;
         *(.tracepoint_patch_sites)
         __tracepoint_patch_sites_end = .;
     } : text
-    .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } : text
-    .data : { *(.data) } :text
+    .data.rel.ro : AT(ADDR(.data.rel.ro) - OSV_KERNEL_VM_SHIFT) { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } : text
+    .data : AT(ADDR(.data) - OSV_KERNEL_VM_SHIFT) { *(.data) } :text
     _init_array_start = .;
-    .init_array : {
+    .init_array : AT(ADDR(.init_array) - OSV_KERNEL_VM_SHIFT) {
         *(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))
         *(.init_array .ctors)
     } : text
     _init_array_end = .;
     . = ALIGN(4096);
-    .percpu : {
+    .percpu : AT(ADDR(.percpu) - OSV_KERNEL_VM_SHIFT) {
         _percpu_start = .;
         *(.percpu)
         . = ALIGN(4096);
         _percpu_end = .;
     }
-    .percpu_workers : {
+    .percpu_workers : AT(ADDR(.percpu_workers) - OSV_KERNEL_VM_SHIFT) {
         _percpu_workers_start = .;
         *(.percpu_workers)
         _percpu_workers_end = .;
     }
     . = ALIGN(64);
-    .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :tls :text
-    .tbss : {
+    .tdata : AT(ADDR(.tdata) - OSV_KERNEL_VM_SHIFT) { *(.tdata .tdata.* .gnu.linkonce.td.*) } :tls :text
+    .tbss : AT(ADDR(.tbss) - OSV_KERNEL_VM_SHIFT) {
         *(.tbss .tbss.* .gnu.linkonce.tb.*)
         . = ALIGN(64);
     } :tls :text
     .tls_template_size = SIZEOF(.tdata) + SIZEOF(.tbss);
-    .bss : { *(.bss .bss.*) } :text
+    .bss : AT(ADDR(.bss) - OSV_KERNEL_VM_SHIFT) { *(.bss .bss.*) } :text
     . = ALIGN(64);
     tcb0 = .;
     . = . + .tls_template_size + 256;
@@ -114,4 +116,4 @@ PHDRS {
     eh_frame PT_GNU_EH_FRAME;
     note PT_NOTE;
 }
-ENTRY(vmlinux_entry64);
+ENTRY(low_vmlinux_entry64);
diff --git a/arch/x64/vmlinux-boot64.S b/arch/x64/vmlinux-boot64.S
index 230afd3ce0..12047513c0 100644
--- a/arch/x64/vmlinux-boot64.S
+++ b/arch/x64/vmlinux-boot64.S
@@ -13,7 +13,9 @@ vmlinux_entry64:
     mov %rsi, %rdi
 
     # Load the 64-bit version of the GDT
-    lgdt gdt64_desc
+    # Because the memory is mapped 1:1 at this point, we have to manually
+    # subtract OSV_KERNEL_VM_SHIFT from the gdt address
+    lgdt gdt64_desc-OSV_KERNEL_VM_SHIFT
 
     # Setup the stack to switch back to 32-bit mode in order
     # to converge with the code that sets up transiton to 64-bit mode later.
@@ -32,6 +34,6 @@ vmlinux_entry64:
     # to start32_from_64 which is where the boot process converges.
     subq $8, %rsp
     movl $0x18, 4(%rsp)
-    movl $start32_from_64, %eax
+    movl $start32_from_64-OSV_KERNEL_VM_SHIFT, %eax # Because memory is mapped 1:1 subtract OSV_KERNEL_VM_SHIFT
     movl %eax, (%rsp)
     lret
diff --git a/arch/x64/xen.cc b/arch/x64/xen.cc
index 462c266c58..c02bf62c2c 100644
--- a/arch/x64/xen.cc
+++ b/arch/x64/xen.cc
@@ -172,7 +172,7 @@ void xen_init(processor::features_type &features, unsigned base)
     // Base + 1 would have given us the version number, it is mostly
     // uninteresting for us now
     auto x = processor::cpuid(base + 2);
-    processor::wrmsr(x.b, cast_pointer(&hypercall_page));
+    processor::wrmsr(x.b, cast_pointer(&hypercall_page) - OSV_KERNEL_VM_SHIFT);
 
     struct xen_feature_info info;
     // To fill up the array used by C code
@@ -192,7 +192,7 @@ void xen_init(processor::features_type &features, unsigned base)
     map.domid = DOMID_SELF;
     map.idx = 0;
     map.space = 0;
-    map.gpfn = cast_pointer(&xen_shared_info) >> 12;
+    map.gpfn = (cast_pointer(&xen_shared_info) - OSV_KERNEL_VM_SHIFT) >> 12;
 
     // 7 => add to physmap
     if (memory_hypercall(XENMEM_add_to_physmap, &map))
diff --git a/core/elf.cc b/core/elf.cc
index fc2ee0c368..477a017794 100644
--- a/core/elf.cc
+++ b/core/elf.cc
@@ -1099,7 +1099,7 @@ void create_main_program()
 program::program(void* addr)
     : _next_alloc(addr)
 {
-    _core = std::make_shared<memory_image>(*this, (void*)ELF_IMAGE_START);
+    _core = std::make_shared<memory_image>(*this, (void*)(ELF_IMAGE_START + OSV_KERNEL_VM_SHIFT));
     assert(_core->module_index() == core_module_index);
     _core->load_segments();
     set_search_path({"/", "/usr/lib"});
diff --git a/core/mmu.cc b/core/mmu.cc
index f9294125a7..575834ca3f 100644
--- a/core/mmu.cc
+++ b/core/mmu.cc
@@ -91,12 +91,12 @@ phys pte_level_mask(unsigned level)
     return ~((phys(1) << shift) - 1);
 }
 
+static void *elf_phys_start = (void*)OSV_KERNEL_BASE;
 void* phys_to_virt(phys pa)
 {
-    // The ELF is mapped 1:1
     void* phys_addr = reinterpret_cast<void*>(pa);
-    if ((phys_addr >= elf_start) && (phys_addr < elf_start + elf_size)) {
-        return phys_addr;
+    if ((phys_addr >= elf_phys_start) && (phys_addr < elf_phys_start + elf_size)) {
+        return (void*)(phys_addr + OSV_KERNEL_VM_SHIFT);
     }
 
     return phys_mem + pa;
@@ -106,9 +106,8 @@ phys virt_to_phys_pt(void* virt);
 
 phys virt_to_phys(void *virt)
 {
-    // The ELF is mapped 1:1
     if ((virt >= elf_start) && (virt < elf_start + elf_size)) {
-        return reinterpret_cast<phys>(virt);
+        return reinterpret_cast<phys>((void*)(virt - OSV_KERNEL_VM_SHIFT));
     }
 
 #if CONF_debug_memory
diff --git a/loader.cc b/loader.cc
index 7d88e609b5..7ac99ef54b 100644
--- a/loader.cc
+++ b/loader.cc
@@ -102,7 +102,8 @@ void premain()
 
     arch_init_premain();
 
-    auto inittab = elf::get_init(elf_header);
+    auto inittab = elf::get_init(reinterpret_cast<elf::Elf64_Ehdr*>(
+        (void*)elf_header + OSV_KERNEL_VM_SHIFT));
 
     if (inittab.tls.start == nullptr) {
         debug_early("premain: failed to get TLS data from ELF\n");
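---

Note on the address arithmetic above: the whole patch rests on a single invariant, virt = phys + OSV_KERNEL_VM_SHIFT, for any address inside the kernel image. Below is a minimal, self-contained C++ sketch of that invariant - it is not part of the patch; the constants mirror the x64 values set in the Makefile, and elf_size is a hypothetical image size used only for illustration:

#include <cassert>
#include <cstdint>

// Values the Makefile derives: kernel_vm_shift = kernel_vm_base - kernel_base
constexpr uint64_t kernel_base     = 0x200000;    // physical load address (OSV_KERNEL_BASE)
constexpr uint64_t kernel_vm_base  = 0x40200000;  // linked virtual base (OSV_KERNEL_VM_BASE)
constexpr uint64_t kernel_vm_shift = kernel_vm_base - kernel_base;  // 0x40000000
constexpr uint64_t elf_size        = 0x600000;    // hypothetical kernel image size

// True when a virtual address falls inside the linked kernel image,
// mirroring the elf_start/elf_size range check in mmu::virt_to_phys()
constexpr bool within_kernel_image(uint64_t virt) {
    return virt >= kernel_vm_base && virt < kernel_vm_base + elf_size;
}

// Mirror of the new kernel-range special case in mmu::phys_to_virt()/virt_to_phys()
constexpr uint64_t phys_to_virt(uint64_t pa)   { return pa + kernel_vm_shift; }
constexpr uint64_t virt_to_phys(uint64_t virt) { return virt - kernel_vm_shift; }

int main()
{
    // The linker script computes the physical entry point the same way:
    // low_vmlinux_entry64 = vmlinux_entry64 - OSV_KERNEL_VM_SHIFT
    assert(virt_to_phys(kernel_vm_base) == kernel_base);
    assert(phys_to_virt(kernel_base) == kernel_vm_base);
    // Round trip for an arbitrary address inside the image
    assert(within_kernel_image(kernel_vm_base + 0x1234));
    assert(phys_to_virt(virt_to_phys(kernel_vm_base + 0x1234)) == kernel_vm_base + 0x1234);
    // Consistent with the guard in boot.S, which only supports a shift of 0x40000000 or 0
    static_assert(kernel_vm_shift == 0x40000000, "shift expected by the ident_pt_l3 setup");
    return 0;
}

With kernel_base = 0x200000 and kernel_vm_base = 0x40200000 the Makefile rule computes kernel_vm_shift = 0x40000000, which is exactly the one-GiB shift that the two ident_pt_l3 entries in boot.S are hard-wired to support.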