Skip to content

Commit

Permalink
Move kernel to 0x40200000 address (1 GiB higher) in virtual memory
Browse files Browse the repository at this point in the history
This patch provides all necessary changes to move OSv kernel by 1 GiB higher
in virtual memory space to start at 0x40200000. Most changes involve adding
or subtracting 0x40000000 (OSV_KERNEL_VM_SHIFT) in all relevant places. Please
note that the kernel is still loaded at 2MiB in physical memory.

The motivation for this patch is to make as much space as possible (or just enough)
in virtual memory to allow running unmodified Linux non-PIE executables (issue #190).
Even though due to the advancement of ASLR more and more applications are PIEs (Position
Independent Executables) which are pretty well supported by OSv, there are still many
non-PIEs (Position Dependent Executables) that are out. The most prominent one is
actually the JVM, most distributions of which come with a tiny (~20K) bootstrap java non-PIE
executable. There are many other examples where small non-PIE executable loads
other shared libraries.

As issue #1043 explains there are at least 3 possible solutions and
this patch implements the 3rd (last) one described there. Please note that in the future
with little effort we could provide a slightly better scheme for OSV_KERNEL_VM_SHIFT
that would allow us to place the kernel even higher at the end of the 2GiB limit
(small memory model) and thus support virtually any non-PIE built using small memory model.

Due to its impact, this patch has been tested on the following hypervisors:
- QEMU without KVM
- QEMU with KVM
- Firecracker
- VirtualBox 6
- VMware Player
- XEN on EC2
- XEN locally in HVM mode

Fixes #1043

Signed-off-by: Waldemar Kozaczuk <jwkozaczuk@gmail.com>
Message-Id: <20190620040707.23249-1-jwkozaczuk@gmail.com>
  • Loading branch information
wkozaczuk authored and nyh committed Jun 20, 2019
1 parent 9e34f42 commit 2a1795d
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 61 deletions.
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ gcc-sysroot = $(if $(CROSS_PREFIX), --sysroot external/$(arch)/gcc.bin) \
# To add something that will *not* be part of the main kernel, you can do:
#
# mydir/*.o EXTRA_FLAGS = <MY_STUFF>
EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_BASE=$(kernel_base) -DOSV_LZKERNEL_BASE=$(lzkernel_base)
EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_BASE=$(kernel_base) -DOSV_KERNEL_VM_SHIFT=$(kernel_vm_shift) -DOSV_LZKERNEL_BASE=$(lzkernel_base)
EXTRA_LIBS =
COMMON = $(autodepend) -g -Wall -Wno-pointer-arith $(CFLAGS_WERROR) -Wformat=0 -Wno-format-security \
-D __BSD_VISIBLE=1 -U _FORTIFY_SOURCE -fno-stack-protector $(INCLUDES) \
Expand Down Expand Up @@ -421,6 +421,7 @@ ifeq ($(arch),x64)
# lzkernel_base is where the compressed kernel is loaded from disk.
kernel_base := 0x200000
lzkernel_base := 0x100000
kernel_vm_base := 0x40200000

$(out)/arch/x64/boot16.o: $(out)/lzloader.elf
$(out)/boot.bin: arch/x64/boot16.ld $(out)/arch/x64/boot16.o
Expand Down Expand Up @@ -480,6 +481,7 @@ endif # x64
ifeq ($(arch),aarch64)

kernel_base := 0x40080000
kernel_vm_base := 0x40080000

include $(libfdt_base)/Makefile.libfdt
libfdt-source := $(patsubst %.c, $(libfdt_base)/%.c, $(LIBFDT_SRCS))
Expand All @@ -500,6 +502,8 @@ $(out)/loader.img: $(out)/preboot.bin $(out)/loader-stripped.elf

endif # aarch64

kernel_vm_shift := $(shell printf "0x%X" $(shell expr $$(( $(kernel_vm_base) - $(kernel_base) )) ))

$(out)/bsd/sys/crypto/rijndael/rijndael-api-fst.o: COMMON+=-fno-strict-aliasing
$(out)/bsd/sys/crypto/sha2/sha2.o: COMMON+=-fno-strict-aliasing
$(out)/bsd/sys/net/route.o: COMMON+=-fno-strict-aliasing
Expand Down Expand Up @@ -1873,6 +1877,7 @@ stage1: $(stage1_targets) links

$(out)/loader.elf: $(stage1_targets) arch/$(arch)/loader.ld $(out)/bootfs.o
$(call quiet, $(LD) -o $@ --defsym=OSV_KERNEL_BASE=$(kernel_base) \
--defsym=OSV_KERNEL_VM_BASE=$(kernel_vm_base) --defsym=OSV_KERNEL_VM_SHIFT=$(kernel_vm_shift) \
-Bdynamic --export-dynamic --eh-frame-hdr --enable-new-dtags \
$(^:%.ld=-T %.ld) \
--whole-archive \
Expand Down
39 changes: 24 additions & 15 deletions arch/x64/arch-setup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,15 @@ extern boot_time_chart boot_time;
// it by placing address of start32 at the known offset at memory
// as defined by section .start32_address in loader.ld
extern "C" void start32();
void * __attribute__((section (".start32_address"))) start32_address = reinterpret_cast<void*>(&start32);
void * __attribute__((section (".start32_address"))) start32_address =
reinterpret_cast<void*>((long)&start32 - OSV_KERNEL_VM_SHIFT);

void arch_setup_free_memory()
{
static ulong edata;
static ulong edata, edata_phys;
asm ("movl $.edata, %0" : "=rm"(edata));
edata_phys = edata - OSV_KERNEL_VM_SHIFT;

// copy to stack so we don't free it now
auto omb = *osv_multiboot_info;
auto mb = omb.mb;
Expand Down Expand Up @@ -129,13 +132,13 @@ void arch_setup_free_memory()
// page tables have been set up, so we can't reference the memory being
// freed.
for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
// can't free anything below edata, it's core code.
// can't free anything below edata_phys, it's core code.
// can't free anything below kernel at this moment
if (ent.addr + ent.size <= edata) {
if (ent.addr + ent.size <= edata_phys) {
return;
}
if (intersects(ent, edata)) {
ent = truncate_below(ent, edata);
if (intersects(ent, edata_phys)) {
ent = truncate_below(ent, edata_phys);
}
// ignore anything above 1GB, we haven't mapped it yet
if (intersects(ent, initial_map)) {
Expand All @@ -149,21 +152,27 @@ void arch_setup_free_memory()
auto base = reinterpret_cast<void*>(get_mem_area_base(area));
mmu::linear_map(base, 0, initial_map, initial_map);
}
// map the core, loaded 1:1 by the boot loader
mmu::phys elf_phys = reinterpret_cast<mmu::phys>(elf_header);
elf_start = reinterpret_cast<void*>(elf_header);
elf_size = edata - elf_phys;
mmu::linear_map(elf_start, elf_phys, elf_size, OSV_KERNEL_BASE);
// Map the core, loaded by the boot loader
// In order to properly setup mapping between virtual
// and physical we need to take into account where kernel
// is loaded in physical memory - elf_phys_start - and
// where it is linked to start in virtual memory - elf_start
static mmu::phys elf_phys_start = reinterpret_cast<mmu::phys>(elf_header);
// There is simple invariant between elf_phys_start and elf_start
// as expressed by the assignment below
elf_start = reinterpret_cast<void*>(elf_phys_start + OSV_KERNEL_VM_SHIFT);
elf_size = edata_phys - elf_phys_start;
mmu::linear_map(elf_start, elf_phys_start, elf_size, OSV_KERNEL_BASE);
// get rid of the command line, before low memory is unmapped
parse_cmdline(mb);
// now that we have some free memory, we can start mapping the rest
mmu::switch_to_runtime_page_tables();
for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
//
// Free the memory below elf_start which we could not before
if (ent.addr < (u64)elf_start) {
if (ent.addr + ent.size >= (u64)elf_start) {
ent = truncate_above(ent, (u64) elf_start);
// Free the memory below elf_phys_start which we could not before
if (ent.addr < (u64)elf_phys_start) {
if (ent.addr + ent.size >= (u64)elf_phys_start) {
ent = truncate_above(ent, (u64) elf_phys_start);
}
mmu::free_initial_memory_range(ent.addr, ent.size);
return;
Expand Down
41 changes: 32 additions & 9 deletions arch/x64/boot.S
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,28 @@
.align 4096
.global ident_pt_l4
ident_pt_l4:
.quad ident_pt_l3 + 0x67
# The addresses of the paging tables have to be the physical ones, so we have to
# manually subtract OSV_KERNEL_VM_SHIFT in all relevant places
.quad ident_pt_l3 + 0x67 - OSV_KERNEL_VM_SHIFT
.rept 511
.quad 0
.endr
#if OSV_KERNEL_VM_SHIFT != 0x40000000 && OSV_KERNEL_VM_SHIFT != 0
#error This code only works correctly for OSV_KERNEL_VM_SHIFT = 0x40000000 or 0
#endif
ident_pt_l3:
.quad ident_pt_l2 + 0x67
.rept 511
# Each of the 512 entries in this table maps the very 1st 512 GiB of
# virtual address space 1 GiB at a time
# The very 1st entry maps 1st GiB 1:1 by pointing to ident_pt_l2 table
# that specifies addresses of every one of 512 2MiB slots of physical memory
.quad ident_pt_l2 + 0x67 - OSV_KERNEL_VM_SHIFT
# The 2nd entry maps 2nd GiB to the same 1st GiB of physical memory by pointing
# to the same ident_pt_l2 table as the 1st entry above
# This way we effectively provide correct mapping for the kernel linked
# to start at 1 GiB + 2 MiB (0x40200000) in virtual memory and point to
# 2 MiB address (0x200000) where it starts in physical memory
.quad ident_pt_l2 + 0x67 - OSV_KERNEL_VM_SHIFT
.rept 510
.quad 0
.endr
ident_pt_l2:
Expand All @@ -42,7 +57,8 @@ ident_pt_l2:

gdt_desc:
.short gdt_end - gdt - 1
.long gdt
# subtract OSV_KERNEL_VM_SHIFT because when gdt_desc is referenced, the memory is mapped 1:1
.long gdt - OSV_KERNEL_VM_SHIFT

# Set up the 64-bit compatible version of GDT description structure
# that points to the same GDT (Global segments Descriptors Table) and
Expand All @@ -53,7 +69,8 @@ gdt_desc:
.align 8
gdt64_desc:
.short gdt_end - gdt - 1
.quad gdt
# subtract OSV_KERNEL_VM_SHIFT because when gdt64_desc is referenced, the memory is mapped 1:1
.quad gdt - OSV_KERNEL_VM_SHIFT

.align 8
gdt = . - 8
Expand All @@ -77,10 +94,12 @@ init_stack_top = .
.globl start32
.globl start32_from_64
start32:
# Because the memory is mapped 1:1 at this point, we have to manualy
# subtract OSV_KERNEL_VM_SHIFT from virtual addresses in all relevant places
# boot16.S set %eax to ELF start address, we'll use it later
mov %eax, %ebp
mov $0x0, %edi
lgdt gdt_desc
lgdt gdt_desc-OSV_KERNEL_VM_SHIFT

# Add an address the vmlinux_entry64 will jump to when
# switching from 64-bit to 32-bit mode
Expand All @@ -91,7 +110,7 @@ start32_from_64:
mov %eax, %fs
mov %eax, %gs
mov %eax, %ss
ljmp $0x18, $1f
ljmp $0x18, $1f-OSV_KERNEL_VM_SHIFT
1:
and $~7, %esp
# Enable PAE (Physical Address Extension) - ability to address 64GB
Expand All @@ -101,6 +120,9 @@ start32_from_64:

# Set root of a page table in cr3
lea ident_pt_l4, %eax
# The address of the root paging table has to be physical
# so substract OSV_KERNEL_VM_SHIFT from ident_pt_l4
sub $OSV_KERNEL_VM_SHIFT, %eax
mov %eax, %cr3

# Set long mode
Expand Down Expand Up @@ -128,7 +150,7 @@ start64:
jz start64_continue
call extract_linux_boot_params
mov $0x1000, %rbx
mov $0x200000, %rbp
mov $OSV_KERNEL_BASE, %rbp

start64_continue:
lea .bss, %rdi
Expand Down Expand Up @@ -168,6 +190,7 @@ smpboot:
mov smpboot_cr4-smpboot, %eax
mov %eax, %cr4
lea ident_pt_l4, %eax
sub $OSV_KERNEL_VM_SHIFT, %eax
mov %eax, %cr3
mov smpboot_efer-smpboot, %eax
mov smpboot_efer+4-smpboot, %edx
Expand All @@ -181,7 +204,7 @@ smpboot:

smpboot_gdt_desc:
.short gdt_end - gdt - 1
.long gdt
.long gdt - OSV_KERNEL_VM_SHIFT
.global smpboot_cr0
smpboot_cr0:
.long 0
Expand Down
3 changes: 2 additions & 1 deletion arch/x64/entry-xen.S
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

elfnote_val(XEN_ELFNOTE_ENTRY, xen_start)
elfnote_val(XEN_ELFNOTE_HYPERCALL_PAGE, hypercall_page)
elfnote_val(XEN_ELFNOTE_VIRT_BASE, 0)
elfnote_val(XEN_ELFNOTE_VIRT_BASE, OSV_KERNEL_VM_SHIFT)
elfnote_str(XEN_ELFNOTE_XEN_VERSION, "xen-3.0")
elfnote_str(XEN_ELFNOTE_GUEST_OS, "osv")
elfnote_str(XEN_ELFNOTE_GUEST_VERSION, "?.?")
Expand All @@ -50,4 +50,5 @@ xen_start:
mov %rsp, xen_bootstrap_end
mov %rsi, %rdi
call xen_init
mov $0x0, %rdi
jmp start64
50 changes: 26 additions & 24 deletions arch/x64/loader.ld
Original file line number Diff line number Diff line change
Expand Up @@ -14,77 +14,79 @@ SECTIONS
*
* We can't export the ELF header base as a symbol, because ld
* insists on moving stuff around if we do.
*
*/
. = OSV_KERNEL_VM_BASE + 0x800;
/*
* Place address of start32 routine at predefined offset in memory
*/
. = OSV_KERNEL_BASE + 0x800;
.start32_address : {
.start32_address : AT(ADDR(.start32_address) - OSV_KERNEL_VM_SHIFT) {
*(.start32_address)
}
. = OSV_KERNEL_BASE + 0x1000;
.dynamic : { *(.dynamic) } :dynamic :text
.text : {
. = OSV_KERNEL_VM_BASE + 0x1000;
.dynamic : AT(ADDR(.dynamic) - OSV_KERNEL_VM_SHIFT) { *(.dynamic) } :dynamic :text
.text : AT(ADDR(.text) - OSV_KERNEL_VM_SHIFT) {
text_start = .;
*(.text.hot .text.hot.*)
*(.text.unlikely .text.*_unlikely)
*(.text.fixup)
*(.text.startup .text.startup.*)
*(.text .text.*)
text_end = .;
PROVIDE(low_vmlinux_entry64 = vmlinux_entry64 - OSV_KERNEL_VM_SHIFT);
} :text
. = ALIGN(8);
.fixup : {
.fixup : AT(ADDR(.fixup) - OSV_KERNEL_VM_SHIFT) {
fault_fixup_start = .;
*(.fixup)
fault_fixup_end = .;
} :text

. = ALIGN(8);
.memcpy_decode : {
.memcpy_decode : AT(ADDR(.memcpy_decode) - OSV_KERNEL_VM_SHIFT) {
memcpy_decode_start = .;
*(.memcpy_decode)
memcpy_decode_end = .;
} :text

.eh_frame : { *(.eh_frame) } : text
.rodata : { *(.rodata*) } :text
.eh_frame : { *(.eh_frame) } :text
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame
.note : { *(.note*) } :text :note
.gcc_except_table : { *(.gcc_except_table) *(.gcc_except_table.*) } : text
.tracepoint_patch_sites ALIGN(8) : {
.eh_frame : AT(ADDR(.eh_frame) - OSV_KERNEL_VM_SHIFT) { *(.eh_frame) } : text
.rodata : AT(ADDR(.rodata) - OSV_KERNEL_VM_SHIFT) { *(.rodata*) } :text
.eh_frame : AT(ADDR(.eh_frame) - OSV_KERNEL_VM_SHIFT) { *(.eh_frame) } :text
.eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - OSV_KERNEL_VM_SHIFT) { *(.eh_frame_hdr) } :text :eh_frame
.note : AT(ADDR(.note) - OSV_KERNEL_VM_SHIFT) { *(.note*) } :text :note
.gcc_except_table : AT(ADDR(.gcc_except_table) - OSV_KERNEL_VM_SHIFT) { *(.gcc_except_table) *(.gcc_except_table.*) } : text
.tracepoint_patch_sites ALIGN(8) : AT(ADDR(.tracepoint_patch_sites) - OSV_KERNEL_VM_SHIFT) {
__tracepoint_patch_sites_start = .;
*(.tracepoint_patch_sites)
__tracepoint_patch_sites_end = .;
} : text
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } : text
.data : { *(.data) } :text
.data.rel.ro : AT(ADDR(.data.rel.ro) - OSV_KERNEL_VM_SHIFT) { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } : text
.data : AT(ADDR(.data) - OSV_KERNEL_VM_SHIFT) { *(.data) } :text
_init_array_start = .;
.init_array : {
.init_array : AT(ADDR(.init_array) - OSV_KERNEL_VM_SHIFT) {
*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))
*(.init_array .ctors)
} : text
_init_array_end = .;
. = ALIGN(4096);
.percpu : {
.percpu : AT(ADDR(.percpu) - OSV_KERNEL_VM_SHIFT) {
_percpu_start = .;
*(.percpu)
. = ALIGN(4096);
_percpu_end = .;
}
.percpu_workers : {
.percpu_workers : AT(ADDR(.percpu_workers) - OSV_KERNEL_VM_SHIFT) {
_percpu_workers_start = .;
*(.percpu_workers)
_percpu_workers_end = .;
}
. = ALIGN(64);
.tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :tls :text
.tbss : {
.tdata : AT(ADDR(.tdata) - OSV_KERNEL_VM_SHIFT) { *(.tdata .tdata.* .gnu.linkonce.td.*) } :tls :text
.tbss : AT(ADDR(.tbss) - OSV_KERNEL_VM_SHIFT) {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
. = ALIGN(64);
} :tls :text
.tls_template_size = SIZEOF(.tdata) + SIZEOF(.tbss);
.bss : { *(.bss .bss.*) } :text
.bss : AT(ADDR(.bss) - OSV_KERNEL_VM_SHIFT) { *(.bss .bss.*) } :text
. = ALIGN(64);
tcb0 = .;
. = . + .tls_template_size + 256;
Expand Down Expand Up @@ -114,4 +116,4 @@ PHDRS {
eh_frame PT_GNU_EH_FRAME;
note PT_NOTE;
}
ENTRY(vmlinux_entry64);
ENTRY(low_vmlinux_entry64);
6 changes: 4 additions & 2 deletions arch/x64/vmlinux-boot64.S
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ vmlinux_entry64:
mov %rsi, %rdi

# Load the 64-bit version of the GDT
lgdt gdt64_desc
# Because the memory is mapped 1:1 at this point, we have to manualy
# subtract OSV_KERNEL_VM_SHIFT from the gdt address
lgdt gdt64_desc-OSV_KERNEL_VM_SHIFT

# Setup the stack to switch back to 32-bit mode in order
# to converge with the code that sets up transiton to 64-bit mode later.
Expand All @@ -32,6 +34,6 @@ vmlinux_entry64:
# to start32_from_64 which is where the boot process converges.
subq $8, %rsp
movl $0x18, 4(%rsp)
movl $start32_from_64, %eax
movl $start32_from_64-OSV_KERNEL_VM_SHIFT, %eax # Because memory is mapped 1:1 subtract OSV_KERNEL_VM_SHIFT
movl %eax, (%rsp)
lret
4 changes: 2 additions & 2 deletions arch/x64/xen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ void xen_init(processor::features_type &features, unsigned base)
// Base + 1 would have given us the version number, it is mostly
// uninteresting for us now
auto x = processor::cpuid(base + 2);
processor::wrmsr(x.b, cast_pointer(&hypercall_page));
processor::wrmsr(x.b, cast_pointer(&hypercall_page) - OSV_KERNEL_VM_SHIFT);

struct xen_feature_info info;
// To fill up the array used by C code
Expand All @@ -192,7 +192,7 @@ void xen_init(processor::features_type &features, unsigned base)
map.domid = DOMID_SELF;
map.idx = 0;
map.space = 0;
map.gpfn = cast_pointer(&xen_shared_info) >> 12;
map.gpfn = (cast_pointer(&xen_shared_info) - OSV_KERNEL_VM_SHIFT) >> 12;

// 7 => add to physmap
if (memory_hypercall(XENMEM_add_to_physmap, &map))
Expand Down
2 changes: 1 addition & 1 deletion core/elf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1099,7 +1099,7 @@ void create_main_program()
program::program(void* addr)
: _next_alloc(addr)
{
_core = std::make_shared<memory_image>(*this, (void*)ELF_IMAGE_START);
_core = std::make_shared<memory_image>(*this, (void*)(ELF_IMAGE_START + OSV_KERNEL_VM_SHIFT));
assert(_core->module_index() == core_module_index);
_core->load_segments();
set_search_path({"/", "/usr/lib"});
Expand Down
Loading

0 comments on commit 2a1795d

Please sign in to comment.