Boot with virtual == physical to get closer to native Linux.
1) This allows us to get a lot closer to booting bzImages.

2) It means we don't have to know page_offset.

3) The Guest needs to modify the boot pagetables to create the
   PAGE_OFFSET mapping before jumping to C code.

4) guest_pa() walks the page tables rather than using page_offset (a rough
   sketch of such a walk follows below).

5) We don't use page_offset to figure out whether to emulate: it was
   always kinda questionable, and won't work for instructions done
   before remapping (bzImage unpacking in particular).

6) We still want the kernel address for tlb flushing: have the initial
   hypercall give us that, too.
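
For orientation (this is not part of the diff below): point 4 boils down to a
two-level x86 page-table walk instead of subtracting a constant. A minimal
sketch with invented names -- the real guest_pa() in the Host's page-table
code is not visible in this truncated view:

/* Illustrative sketch only, not this commit's guest_pa(): translate a Guest
 * virtual address via a 2-level, non-PAE x86 pagetable.  "guest_mem" stands
 * in for the base of guest physical memory, much as from_guest_phys() gives
 * it to the Launcher. */
#include <stdint.h>

#define SK_PAGE_PRESENT 0x1u
#define SK_FRAME_MASK   0xFFFFF000u

static uint32_t sketch_guest_pa(const uint8_t *guest_mem, uint32_t pgdir_gpa,
				uint32_t vaddr, int *ok)
{
	const uint32_t *pgdir = (const uint32_t *)(guest_mem + pgdir_gpa);
	uint32_t pgd = pgdir[vaddr >> 22];		/* top 10 bits */

	*ok = 0;
	if (!(pgd & SK_PAGE_PRESENT))
		return 0;

	/* The PGD entry holds the frame of a page of PTEs; the middle 10
	 * bits of the address pick one of its 1024 entries. */
	const uint32_t *ptes =
		(const uint32_t *)(guest_mem + (pgd & SK_FRAME_MASK));
	uint32_t pte = ptes[(vaddr >> 12) & 0x3FF];

	if (!(pte & SK_PAGE_PRESENT))
		return 0;

	*ok = 1;
	return (pte & SK_FRAME_MASK) | (vaddr & 0xFFF);	/* frame + offset */
}

A walk like this works for whatever mapping the Guest has set up, which is
why neither side needs to agree on a fixed page_offset any more.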

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
rustyrussell committed Oct 23, 2007
1 parent c18acd7 commit 47436aa
Showing 12 changed files with 141 additions and 148 deletions.
134 changes: 31 additions & 103 deletions Documentation/lguest/lguest.c
@@ -178,19 +178,16 @@ static void *get_pages(unsigned int num)
/* To find out where to start we look for the magic Guest string, which marks
* the code we see in lguest_asm.S. This is a hack which we are currently
* plotting to replace with the normal Linux entry point. */
static unsigned long entry_point(const void *start, const void *end,
unsigned long page_offset)
static unsigned long entry_point(const void *start, const void *end)
{
const void *p;

/* The scan gives us the physical starting address. We want the
* virtual address in this case, and fortunately, we already figured
* out the physical-virtual difference and passed it here in
* "page_offset". */
/* The scan gives us the physical starting address. We boot with
* pagetables set up with virtual and physical the same, so that's
* OK. */
for (p = start; p < end; p++)
if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
return to_guest_phys(p + strlen("GenuineLguest"))
+ page_offset;
return to_guest_phys(p + strlen("GenuineLguest"));

errx(1, "Is this image a genuine lguest?");
}
@@ -224,14 +221,11 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
* by all modern binaries on Linux including the kernel.
*
* The ELF headers give *two* addresses: a physical address, and a virtual
* address. The Guest kernel expects to be placed in memory at the physical
* address, and the page tables set up so it will correspond to that virtual
* address. We return the difference between the virtual and physical
* addresses in the "page_offset" pointer.
* address. We use the physical address; the Guest will map itself to the
* virtual address.
*
* We return the starting address. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
unsigned long *page_offset)
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
void *start = (void *)-1, *end = NULL;
Elf32_Phdr phdr[ehdr->e_phnum];
@@ -255,9 +249,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
err(1, "Reading program headers");

/* We don't know page_offset yet. */
*page_offset = 0;

/* Try all the headers: there are usually only three. A read-only one,
* a read-write one, and a "note" section which isn't loadable. */
for (i = 0; i < ehdr->e_phnum; i++) {
@@ -268,14 +259,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
verbose("Section %i: size %i addr %p\n",
i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

/* We expect a simple linear address space: every segment must
* have the same difference between virtual (p_vaddr) and
* physical (p_paddr) address. */
if (!*page_offset)
*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
errx(1, "Page offset of section %i different", i);

/* We track the first and last address we mapped, so we can
* tell entry_point() where to scan. */
if (from_guest_phys(phdr[i].p_paddr) < start)
@@ -288,50 +271,13 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
phdr[i].p_offset, phdr[i].p_filesz);
}

return entry_point(start, end, *page_offset);
}

/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
*
* We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
* to be. We don't know what that option was, but we can figure it out
* approximately by looking at the addresses in the code. I chose the common
* case of reading a memory location into the %eax register:
*
* movl <some-address>, %eax
*
* This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
* "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
*
* In this example we can guess that the kernel was compiled with
* CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
* kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
* kernel isn't that bloated yet.
*
* Unfortunately, x86 has variable-length instructions, so finding this
* particular instruction properly involves writing a disassembler. Instead,
* we rely on statistics. We look for "0xA1" and tally the different bytes
* which occur 4 bytes later (the "0xC0" in our example above). When one of
* those bytes appears three times, we can be reasonably confident that it
* forms the start of CONFIG_PAGE_OFFSET.
*
* This is amazingly reliable. */
static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
{
unsigned int i, possibilities[256] = { 0 };

for (i = 0; i + 4 < len; i++) {
/* mov 0xXXXXXXXX,%eax */
if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
return (unsigned long)img[i+4] << 24;
}
errx(1, "could not determine page offset");
return entry_point(start, end);
}

/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
* which need loading are extracted and compressed raw. This denies us the
* information we need to make a fully-general loader. */
static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
static unsigned long unpack_bzimage(int fd)
{
gzFile f;
int ret, len = 0;
@@ -352,12 +298,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)

verbose("Unpacked size %i addr %p\n", len, img);

/* Without the ELF header, we can't tell virtual-physical gap. This is
* CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
* I have a clever way of figuring it out from the code itself. */
*page_offset = intuit_page_offset(img, len);

return entry_point(img, img + len, *page_offset);
return entry_point(img, img + len);
}

/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
@@ -368,7 +309,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
* The bzImage is formed by putting the decompressing code in front of the
* compressed kernel code. So we can simply scan through it looking for the
* first "gzip" header, and start decompressing from there. */
static unsigned long load_bzimage(int fd, unsigned long *page_offset)
static unsigned long load_bzimage(int fd)
{
unsigned char c;
int state = 0;
@@ -396,7 +337,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
if (c != 0x03)
state = -1;
else
return unpack_bzimage(fd, page_offset);
return unpack_bzimage(fd);
}
}
errx(1, "Could not find kernel in bzImage");
@@ -405,7 +346,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
* come wrapped up in the self-decompressing "bzImage" format. With some funky
* coding, we can load those, too. */
static unsigned long load_kernel(int fd, unsigned long *page_offset)
static unsigned long load_kernel(int fd)
{
Elf32_Ehdr hdr;

@@ -415,10 +356,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)

/* If it's an ELF file, it starts with "\177ELF" */
if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
return map_elf(fd, &hdr, page_offset);
return map_elf(fd, &hdr);

/* Otherwise we assume it's a bzImage, and try to unpack it */
return load_bzimage(fd, page_offset);
return load_bzimage(fd);
}

/* This is a trivial little helper to align pages. Andi Kleen hated it because
Expand Down Expand Up @@ -463,27 +404,20 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
return len;
}

/* Once we know the address the Guest kernel expects, we can construct simple
* linear page tables for all of memory which will get the Guest far enough
/* Once we know how much memory we have, we can construct simple linear page
* tables which set virtual == physical which will get the Guest far enough
* into the boot to create its own.
*
* We lay them out of the way, just below the initrd (which is why we need to
* know its size). */
static unsigned long setup_pagetables(unsigned long mem,
unsigned long initrd_size,
unsigned long page_offset)
unsigned long initrd_size)
{
unsigned long *pgdir, *linear;
unsigned int mapped_pages, i, linear_pages;
unsigned int ptes_per_page = getpagesize()/sizeof(void *);

/* Ideally we map all physical memory starting at page_offset.
* However, if page_offset is 0xC0000000 we can only map 1G of physical
* (0xC0000000 + 1G overflows). */
if (mem <= -page_offset)
mapped_pages = mem/getpagesize();
else
mapped_pages = -page_offset/getpagesize();
mapped_pages = mem/getpagesize();

/* Each PTE page can map ptes_per_page pages: how many do we need? */
linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
@@ -500,11 +434,9 @@ static unsigned long setup_pagetables(unsigned long mem,
for (i = 0; i < mapped_pages; i++)
linear[i] = ((i * getpagesize()) | PAGE_PRESENT);

/* The top level points to the linear page table pages above. The
* entry representing page_offset points to the first one, and they
* continue from there. */
/* The top level points to the linear page table pages above. */
for (i = 0; i < mapped_pages; i += ptes_per_page) {
pgdir[(i + page_offset/getpagesize())/ptes_per_page]
pgdir[i/ptes_per_page]
= ((to_guest_phys(linear) + i*sizeof(void *))
| PAGE_PRESENT);
}
@@ -535,15 +467,12 @@ static void concat(char *dst, char *args[])
/* This is where we actually tell the kernel to initialize the Guest. We saw
* the arguments it expects when we looked at initialize() in lguest_user.c:
* the base of guest "physical" memory, the top physical page to allow, the
* top level pagetable, the entry point and the page_offset constant for the
* Guest. */
static int tell_kernel(unsigned long pgdir, unsigned long start,
unsigned long page_offset)
* top level pagetable and the entry point for the Guest. */
static int tell_kernel(unsigned long pgdir, unsigned long start)
{
unsigned long args[] = { LHREQ_INITIALIZE,
(unsigned long)guest_base,
guest_limit / getpagesize(),
pgdir, start, page_offset };
guest_limit / getpagesize(), pgdir, start };
int fd;

verbose("Guest: %p - %p (%#lx)\n",
@@ -1424,9 +1353,9 @@ static void usage(void)
/*L:105 The main routine is where the real work begins: */
int main(int argc, char *argv[])
{
/* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
* of the (optional) initrd. */
unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
/* Memory, top-level pagetable, code startpoint and size of the
* (optional) initrd. */
unsigned long mem = 0, pgdir, start, initrd_size = 0;
/* A temporary and the /dev/lguest file descriptor. */
int i, c, lguest_fd;
/* The list of Guest devices, based on command line arguments. */
@@ -1500,8 +1429,7 @@ int main(int argc, char *argv[])
setup_console(&device_list);

/* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
&page_offset);
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));

/* Boot information is stashed at physical address 0 */
boot = from_guest_phys(0);
@@ -1518,7 +1446,7 @@ int main(int argc, char *argv[])
}

/* Set up the initial linear pagetables, starting below the initrd. */
pgdir = setup_pagetables(mem, initrd_size, page_offset);
pgdir = setup_pagetables(mem, initrd_size);

/* The Linux boot header contains an "E820" memory map: ours is a
* simple, single region. */
@@ -1535,7 +1463,7 @@ int main(int argc, char *argv[])

/* We tell the kernel to initialize the Guest: this returns the open
* /dev/lguest file descriptor. */
lguest_fd = tell_kernel(pgdir, start, page_offset);
lguest_fd = tell_kernel(pgdir, start);

/* We fork off a child process, which wakes the Launcher whenever one
* of the input file descriptors needs attention. Otherwise we would
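
A quick, non-normative sanity check of the sizing in setup_pagetables() above
(standalone sketch; the 64MB memory size is just an example, and PTEs are 4
bytes as on 32-bit non-PAE x86):

#include <stdio.h>

int main(void)
{
	unsigned long mem = 64UL << 20;		/* example: 64MB of guest memory */
	unsigned long page_size = 4096;
	unsigned long ptes_per_page = page_size / 4;	/* 1024 */

	unsigned long mapped_pages = mem / page_size;	/* 16384 PTEs */
	unsigned long linear_pages =			/* 16 pages of PTEs */
		(mapped_pages + ptes_per_page - 1) / ptes_per_page;

	/* One top-level slot per PTE page, starting at slot 0 now that
	 * virtual == physical (previously they started at the slot
	 * corresponding to page_offset). */
	printf("%lu PTEs in %lu pages, pgdir slots 0..%lu\n",
	       mapped_pages, linear_pages, linear_pages - 1);
	return 0;
}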
1 change: 1 addition & 0 deletions arch/x86/kernel/asm-offsets_32.c
@@ -136,6 +136,7 @@ void foo(void)
#ifdef CONFIG_LGUEST_GUEST
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
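
The header change itself is in the part of the diff not shown here, but the
new OFFSET() line above implies "struct lguest_data" gains a pgdir field, and
the boot.c hunk below initializes a kernel_address field. A hypothetical
excerpt, with placement and comments guessed from the visible hunks:

/* Hypothetical sketch of the two new lguest_data fields implied by this
 * commit; the real struct lguest_data lives in include/linux/lguest.h. */
struct lguest_data_excerpt {
	/* ... existing fields such as irq_enabled, noirq_start/end ... */

	/* Set by the Guest (boot.c initializes it to PAGE_OFFSET) so the
	 * Host knows where kernel mappings start when flushing TLBs. */
	unsigned long kernel_address;

	/* Written by the Host at initialization: the Guest-physical address
	 * of the toplevel pagetable, which i386_head.S reads (via the
	 * LGUEST_DATA_pgdir offset) to build the PAGE_OFFSET mapping. */
	unsigned long pgdir;
};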
7 changes: 2 additions & 5 deletions arch/x86/lguest/boot.c
@@ -86,6 +86,7 @@ struct lguest_data lguest_data = {
.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
.noirq_start = (u32)lguest_noirq_start,
.noirq_end = (u32)lguest_noirq_end,
.kernel_address = PAGE_OFFSET,
.blocked_interrupts = { 1 }, /* Block timer interrupts */
.syscall_vec = SYSCALL_VECTOR,
};
@@ -1033,11 +1034,7 @@ __init void lguest_init(void *boot)

/*G:070 Now we've seen all the paravirt_ops, we return to
* lguest_init() where the rest of the fairly chaotic boot setup
* occurs.
*
* The Host expects our first hypercall to tell it where our "struct
* lguest_data" is, so we do that first. */
hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
* occurs. */

/* The native boot code sets up initial page tables immediately after
* the kernel itself, and sets init_pg_tables_end so they're not
41 changes: 36 additions & 5 deletions arch/x86/lguest/i386_head.S
@@ -1,5 +1,6 @@
#include <linux/linkage.h>
#include <linux/lguest.h>
#include <asm/lguest_hcall.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
@@ -8,18 +9,48 @@
* looks for. The plan is that the Linux boot protocol will be extended with a
* "platform type" field which will guide us here from the normal entry point,
* but for the moment this suffices. The normal boot code uses %esi for the
* boot header, so we do too. We convert it to a virtual address by adding
* PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
* boot header, so we do too.
*
* WARNING: be very careful here! We're running at addresses equal to physical
* addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data.
*
* The .section line puts this code in .init.text so it will be discarded after
* boot. */
.section .init.text, "ax", @progbits
.ascii "GenuineLguest"
/* Set up initial stack. */
movl $(init_thread_union+THREAD_SIZE),%esp
/* Make initial hypercall now, so we can set up the pagetables. */
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %edx
int $LGUEST_TRAP_ENTRY

/* Set up boot information pointer to hand to lguest_init(): it wants
* a virtual address. */
movl %esi, %eax
addl $__PAGE_OFFSET, %eax
jmp lguest_init

/* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
* instruction uses %esi, so we needed to save it above. */
movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi

/* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
* This means the first 128M of kernel memory will be mapped at
* PAGE_OFFSET where the kernel expects to run. This will get it far
* enough through boot to switch to its own pagetables. */
movl $32, %ecx
movl %esi, %edi
addl $((__PAGE_OFFSET >> 22) * 4), %edi
rep
movsl

/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp


/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
* moment. */
jmp lguest_init+__PAGE_OFFSET

/*G:055 We create a macro which puts the assembler code between lgstart_ and
* lgend_ markers. These templates are put in the .text section: they can't be
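
A quick check of the constants in the copy loop above (illustrative only;
0xC0000000 is merely the usual CONFIG_PAGE_OFFSET value, not something this
commit requires): each page-directory entry covers 4MB (1024 PTEs x 4KB), so
32 copied entries map 128MB, and (__PAGE_OFFSET >> 22) * 4 is the byte offset
of the PAGE_OFFSET slot within the page directory.

#include <stdio.h>

int main(void)
{
	unsigned long page_offset = 0xC0000000UL;	/* assumed PAGE_OFFSET */

	unsigned long pgd_slot  = page_offset >> 22;	/* 768 */
	unsigned long byte_off  = pgd_slot * 4;		/* 3072: the addl operand */
	unsigned long mapped_mb = (32UL << 22) >> 20;	/* 32 entries * 4MB = 128MB */

	printf("slot %lu, byte offset %lu, maps %luMB at PAGE_OFFSET\n",
	       pgd_slot, byte_off, mapped_mb);
	return 0;
}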
8 changes: 4 additions & 4 deletions drivers/lguest/hypercalls.c
@@ -181,15 +181,15 @@ static void initialize(struct lguest *lg)
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
|| put_user(4U*1024*1024, &lg->lguest_data->reserve_mem))
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
kill_guest(lg, "bad guest page %p", lg->lguest_data);

/* We write the current time into the Guest's data page once now. */
write_timestamp(lg);

/* page_tables.c will also do some setup. */
page_table_guest_data_init(lg);

/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the Guest might be referring to the old (read-only)