Skip to content

Commit 23c63ea

Browse files
author
Dan Cross
committed
mmu: Pseudo-recursive page tables
This prototypes support for self-referential mappings by making use of pseudo-recursive page tables. These are "standard" page tables, but we allocate some portion of the address space they define to what we call "the recursive region". We then map the page tables that make up the non-recursive regions of the address space into the recursive region, using addresses derived from the addresses those tables map. These maps are set up early in boot, before entry to main, when setting up the mapping for the per-node data and per-CPU Mach structure for ccNUMA node 0 and CPU 0, respectively. Signed-off-by: Dan Cross <cross@gajendra.net>
1 parent 34c57ae commit 23c63ea

File tree

9 files changed

+405
-87
lines changed

9 files changed

+405
-87
lines changed

x86_64/lib/kernel.ld

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ SECTIONS {
1212
* start of the kernel for critical structures.
1313
*/
1414
. = ${LOAD-ADDRESS};
15-
1615
PROVIDE(text = .);
1716
.text : AT (ADDR(.text) - KZERO) {
1817
*(.text.boot)

x86_64/src/cpu.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ pub(crate) unsafe fn ltr(selector: u16) {
5454
}
5555

5656
/// Loads the "Global Table Descriptor Register" (`GDTR`) with
57-
/// the base address and inclusive limit inclusive of a "Global
58-
/// Descriptor Table" (GDT).
57+
/// the base address and inclusive limit of a "Global Descriptor
58+
/// Table" (GDT).
5959
///
6060
/// # Safety
6161
/// The referred GDT must be architecturally valid.
@@ -139,7 +139,8 @@ pub(crate) unsafe fn wrmsr(msr: u32, value: u64) {
139139
///
140140
/// # Safety
141141
/// The caller must ensure that the given value makes sense as
142-
/// a %gs segment base value.
142+
/// a %gs segment base value. Note that we assume we can use
143+
/// the `WRGSBASE` instruction.
143144
pub(crate) unsafe fn wrgsbase(value: u64) {
144145
unsafe {
145146
asm!("wrgsbase {}", in(reg) value, options(att_syntax));

x86_64/src/dat.rs

Lines changed: 61 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,16 @@ use zerocopy::FromZeros;
77
pub const UREG_TRAPNO_OFFSET: usize = 19 * core::mem::size_of::<u64>();
88
pub const UREG_CS_OFFSET: usize = 22 * core::mem::size_of::<u64>();
99

10+
#[derive(Clone, Copy, Debug, FromZeros)]
11+
#[repr(transparent)]
12+
pub struct HPA(pub u64);
13+
14+
impl HPA {
15+
pub fn from_phys(pa: u64) -> HPA {
16+
HPA(pa)
17+
}
18+
}
19+
1020
/// The user register and trap frame structure.
1121
///
1222
/// This stores both user state during system calls, and
@@ -89,11 +99,36 @@ impl Default for Label {
8999

90100
/// The machine structure, which describes a CPU.
91101
///
102+
/// The small stacks, for specific exceptions, come first and
103+
/// occupy the first 64KiB of the Mach. These have guard pages
104+
/// immediately _after_ their data areas, which serve as guards
105+
/// for the subsequent data. Since the Mach is mapped in a
106+
/// per-CPU portion of virtual address space that is offset from
107+
/// a 2MiB boundary, we know that the first stack is de facto
108+
/// guarded.
109+
///
110+
/// The main kstack for the scheduler comes next, and occupies
111+
/// the next 64KiB. It is guarded by the NMI stack's guard page.
112+
///
113+
/// Thus, stacks and their guards take up the first 128KiB of
114+
/// virtual space in the Mach.
115+
///
116+
/// Next, we have the smaller Mach data itself; this includes the
117+
/// space required for the structures used in the pseudo-recursive
118+
/// page table implementation.
119+
///
92120
/// Warning: the layout of this structure is known to assembly
93121
/// language.
94122
#[derive(FromZeros)]
95123
#[repr(C, align(65536))]
96124
pub struct Mach {
125+
pub debug_stack: ExStack, // Stack for debug exceptions
126+
pub bp_stack: ExStack, // Stack for breakpoint exceptions
127+
pub df_stack: ExStack, // Stack for double faults
128+
pub nmi_stack: ExStack, // Stack for NMIs
129+
pub zero: Page, // Mapped to (per-node) read-only zeroed page
130+
pub stack: KStack, // Kernel stack for scheduler
131+
97132
me: *mut Mach, // %gs:0 is a `*mut Mach` pointing to this `Mach`.
98133
scratch: usize, // A scratch word used on entry to kernel
99134
splpc: usize, // PC of last caller to `splhi`. Cleared by `spllo`.
@@ -114,24 +149,22 @@ pub struct Mach {
114149

115150
sched: Label,
116151

117-
// Architecturally defined.
118-
pub tss: Tss,
119-
120-
// All preceding data fits within a single 4KiB page. Structures
121-
// that follow are sized in page multiples and aligned.
122-
pml4: Page, // PML4 root page table for this Mach
123-
pml3: Page, // The PML3 that maps the kernel for this mach
124-
pml2: Page, // PML2 for low 1GiB
125-
pml1: Page, // PML1 for low 2MiB
126-
pub idt: Idt, // Interrupt descriptor table
127-
zero: Page, // Read-only, zeroed page
128-
pub df_stack: ExStack, // Stack for double faults
129-
pub debug_stack: ExStack, // Stack for debug exceptions
130-
pub nmi_stack: ExStack, // Stack for NMIs
131-
pub stack: KStack, // Kernel stack for scheduler
132-
pub gdt: Gdt, // Gdt is aligned to 64KiB.
152+
// All preceding data after the stack fits within a single 4KiB page.
153+
// Paging structures that follow are sized in page multiples and aligned.
154+
pml4: PTable, // PML4 root page table for this Mach
155+
pml3: PTable, // PML3 for rec 512GiB (root of all subtrees)
156+
pml2: PTable, // PML2 for rec 1GiB
157+
pml1: PTable, // PML1 for rec 2MiB (PML4 and all PML3s)
158+
mpml3: PTable, // PML3 for mapping region
159+
mpml2: PTable, // PML2 for mapping region
160+
mpml1: PTable, // PML1 for mapping region
161+
162+
// Architecturally defined data.
163+
pub tss: Tss, // Truly per-CPU
164+
idt: Idt, // Mapped to per-node IDT
165+
gdt: Gdt, // Mapped to per-CPU GDT; padded to 64k with zeros
133166
}
134-
static_assertions::const_assert_eq!(core::mem::offset_of!(Mach, pml4), 4096);
167+
static_assertions::const_assert_eq!(core::mem::offset_of!(Mach, pml4), 65536 * 2 + 4096);
135168
static_assertions::const_assert_eq!(core::mem::offset_of!(Mach, stack), 65536);
136169

137170
impl Mach {
@@ -142,10 +175,9 @@ impl Mach {
142175
(0, &mut self.stack),
143176
(trap::NMI_TRAPNO, &mut self.nmi_stack),
144177
(trap::DEBUG_TRAPNO, &mut self.debug_stack),
178+
(trap::BREAKPOINT_TRAPNO, &mut self.bp_stack),
145179
(trap::DOUBLE_FAULT_TRAPNO, &mut self.df_stack),
146180
]);
147-
self.gdt.init(&self.tss);
148-
self.idt.init(trap::stubs());
149181
unsafe {
150182
self.gdt.load();
151183
self.idt.load();
@@ -224,6 +256,16 @@ impl Flags {
224256
}
225257
}
226258

259+
#[derive(FromZeros)]
260+
#[repr(C, align(4096))]
261+
pub struct PTable([u64; 512]);
262+
263+
impl PTable {
264+
pub fn array_mut(&mut self) -> &mut [u64; 512] {
265+
&mut self.0
266+
}
267+
}
268+
227269
/// The smallest basic page type.
228270
#[derive(FromZeros)]
229271
#[repr(C, align(4096))]

x86_64/src/l.S

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@
3939
.set KCPUZERO, 0xffffff0000000000
4040
.set KMACH, (1*MiB)
4141
.set KTOFF, (2*MiB)
42+
.set KLOSTK, 0x8000
4243

43-
.set MACHSTKSZ, (16*PGSZ)
44-
.set MACHSTKOFF, (16*PGSZ)
45-
.set MACHGDTSZ, (16*PGSZ)
46-
.set MACHGDTOFF, (32*PGSZ)
44+
.set MACHSTKSZ, (64*KiB)
45+
.set MACHSTKOFF, (64*KiB)
46+
.set MACHGDTSZ, (64*KiB)
47+
.set MACHGDTOFF, (192*KiB)
4748

4849
.set Cr0PE, (1<<0) // Protected Mode Enable
4950
.set Cr0MP, (1<<1) // Monitor Coprocessor
@@ -56,6 +57,7 @@
5657
.set Cr4PSE, (1<<4) // Page-Size Extensions
5758
.set Cr4PAE, (1<<5) // Physical Address Extension
5859
.set Cr4PGE, (1<<7) // Page-Global Enable
60+
.set Cr4FSGSBASE, (1<<16) // (RD|WR)(FS|GS)BASE Extension
5961

6062
.set IA32_EFER, 0xc0000080 // Extended Feature Enable
6163

@@ -156,7 +158,7 @@ start:
156158
// off the page size extension; it is always enabled in PAE mode.
157159
movl %cr4, %eax
158160
andl $~Cr4PSE, %eax
159-
orl $(Cr4PGE|Cr4PAE), %eax
161+
orl $(Cr4PGE|Cr4PAE|Cr4FSGSBASE), %eax
160162
movl %eax, %cr4
161163

162164
// Set up the Extended Feature Enable (EFER) MSR, enabling
@@ -203,19 +205,26 @@ start:
203205
movw %ax, %gs
204206

205207
// We can now use linked addresses for the stack and code.
206-
// We'll jump into the kernel from here.
207-
movabsq $(KCPUZERO+KMACH), %rdi
208-
leaq MACHSTKOFF+MACHSTKSZ(%rdi), %rsp
208+
// Jump to the code that initializes ccNUMA node 0 and the
209+
// cpu0 Mach, returning to warp64, which removes the
210+
// identity map, sets the SP to the scheduler stack, and
211+
// jumps into the kernel.
212+
leaq KMACH, %rdi
213+
leaq KLOSTK, %rsp
209214
movabsq $warp64, %rax
210215
pushq %rax
216+
movabsq $init0, %rax
217+
pushq %rax
211218
ret
212219

213220
warp64:
221+
// Now load the stack pointer with our Mach stack.
222+
movabsq $(KCPUZERO+KMACH), %rdi
223+
leaq MACHSTKOFF+MACHSTKSZ(%rdi), %rsp
214224
// At this point, we are fully in the kernel virtual
215225
// address space and can discard the identity mapping.
216226
// To do so, we load the early PML4 that does not have
217227
// that entry.
218-
movq $(earlypml4 - KZERO), %rax
219228
movq %rax, %cr3 // Also flushes TLB.
220229

221230
// A pointer to the Mach is already in %rdi, and is the
@@ -369,7 +378,9 @@ b1982:
369378
// Enable and activate Long Mode.
370379
movl %cr4, %eax
371380
andl $~Cr4PSE, %eax // PSE always true in long mode
372-
orl $(Cr4PGE|Cr4PAE), %eax // Page Global, Phys. Address Ext
381+
// Enable page global, physical address extension, and FS/GS base
382+
// load/store instruction support.
383+
orl $(Cr4PGE|Cr4PAE|Cr4FSGSBASE), %eax
373384
movl %eax, %cr4
374385

375386
movl $IA32_EFER, %ecx // Extended Feature Enable
@@ -386,28 +397,27 @@ b1982:
386397

387398
.code64
388399
1:
389-
leaq APMACH, %rdi
390-
movq %rdi, %rsp
391-
movabsq $apwarp64, %rax
392-
pushq %rax
393-
ret
394-
ud2
395-
396-
apwarp64:
397400
movabsq $gdtdescv, %rax
398401
lgdt (%rax)
399402
pushq $GdtCODE64
400403
pushq $(1f-KZERO)
401404
lretq
402405
1:
403-
404406
xorl %eax, %eax
405407
movw %ax, %ds
406408
movw %ax, %es
407409
movw %ax, %fs
408410
movw %ax, %gs
409411
movw %ax, %ss
410412

413+
leaq APMACH, %rdi
414+
movq %rdi, %rsp
415+
movabsq $apwarp64, %rax
416+
pushq %rax
417+
ret
418+
ud2
419+
420+
apwarp64:
411421
// Move to the real PML4 from our Mach.
412422
movq (%rdi), %rax
413423
movq %rax, %cr3
@@ -419,7 +429,7 @@ apwarp64:
419429

420430
pushq $0
421431
xorl %ebp, %ebp
422-
movq 8(%rdi), %rax // m->scratch
432+
movq 128*KiB+8(%rdi), %rax // m->scratch
423433
pushq %rax
424434
pushq $2 // Clear flags
425435
popfq
@@ -432,13 +442,8 @@ e1978:
432442
bootpml4:
433443
.quad bootidentitypt3 - KZERO + (PteRW | PteP)
434444
.space 4096 - 3*8
435-
.quad earlymachpt3 - KZERO + (PteRW | PteP)
436-
.quad earlykernpt3 - KZERO + (PteRW | PteP)
437-
438-
earlypml4:
439-
.space 4096 - 2*8
440-
.quad earlymachpt3 - KZERO + (PteRW | PteP)
441-
.quad earlykernpt3 - KZERO + (PteRW | PteP)
445+
.quad bootmachpt3 - KZERO + (PteRW | PteP)
446+
.quad bootkernpt3 - KZERO + (PteRW | PteP)
442447

443448
bootidentitypt3:
444449
.quad (0<<30) + (PtePS | PteRW | PteP)
@@ -447,11 +452,15 @@ bootidentitypt3:
447452
.quad (3<<30) + (PtePS | PteRW | PteP)
448453
.space 4096 - 4*8
449454

450-
earlymachpt3:
455+
bootmachpt3:
456+
// The CPU part
457+
.quad (0<<30) + (PtePS | PteRW | PteP)
458+
.space 4096/2 - 1*8
459+
// The node part
451460
.quad (0<<30) + (PtePS | PteRW | PteP)
452-
.space 4096 - 1*8
461+
.space 4096/2 - 1*8
453462

454-
earlykernpt3:
463+
bootkernpt3:
455464
.space 4096 - 2*8
456465
.quad (0<<30) + (PtePS | PteRW | PteP)
457466
.quad 0

x86_64/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ mod allocator;
1010
mod cpu;
1111
mod dat;
1212
mod devcons;
13+
mod node0;
1314
mod pio;
1415
mod proc;
1516
mod syscall;

0 commit comments

Comments
 (0)