Commit e3089a03 authored by kib's avatar kib
Browse files

i386 4/4G split.

The change makes the user and kernel address spaces on i386
independent, giving each almost the full 4G of usable virtual addresses
except for one PDE at the top used for the trampoline and per-CPU
trampoline stacks, and system structures that must always be mapped,
namely the IDT, GDT, common TSS and LDT, and the process-private TSS
and LDT if allocated.

By using a 1:1 mapping for the kernel text and data, it turned out to
be possible to eliminate the assembler part of locore.S which
bootstraps the initial page table and KPTmap.  The code is rewritten in
C and moved into pmap_cold().  The comment in vmparam.h explains the
KVA layout.

There is no PCID mechanism available in protected mode, so each
kernel/user switch back and forth completely flushes the TLB, except
for the trampoline PTD region.  The TLB invalidations for userspace
become trivial, because IPI handlers switch page tables.  On the other
hand, context switches no longer need to reload %cr3.

copyout(9) was rewritten to use vm_fault_quick_hold().  An issue for
the new copyout(9) is compatibility with wiring user buffers around
sysctl handlers.  This explains the two kinds of locks for copyout ptes
and the accounting of the vslock() calls.  The vm_fault_quick_hold()
path, AKA the slow path, is only tried after the 'fast path' has
failed; the fast path temporarily changes the mapping to userspace and
copies the data to/from a small per-cpu buffer in the trampoline.  If a
page fault occurs during the copy, it is short-circuited by exception.s
so that it does not even reach C code.

The change was motivated by the need to implement the Meltdown
mitigation, but instead of KPTI the full split is done.  The i386
architecture already shows sizing problems; in particular, it is
impossible to link clang and lld with debugging.  I expect that the
issues due to the virtual address space limits will only get worse,
and the split extends the useful life of the platform.

Tested by: pho
Discussed with:	bde
Sponsored by:	The FreeBSD Foundation
MFC after:	1 month
Differential revision:	https://reviews.freebsd.org/D14633
parent 02e56ea3
......@@ -29,6 +29,8 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/pcb.h>
#include <machine/frame.h>
#include <machine/segments.h>
......@@ -279,12 +281,26 @@ kgdb_trgt_frame_cache(struct frame_info *next_frame, void **this_cache)
char buf[MAX_REGISTER_SIZE];
struct kgdb_frame_cache *cache;
char *pname;
CORE_ADDR pcx;
uintptr_t addr, setidt_disp;
cache = *this_cache;
if (cache == NULL) {
cache = FRAME_OBSTACK_ZALLOC(struct kgdb_frame_cache);
*this_cache = cache;
cache->pc = frame_func_unwind(next_frame);
pcx = frame_pc_unwind(next_frame);
if (pcx >= PMAP_TRM_MIN_ADDRESS) {
addr = kgdb_lookup("setidt_disp");
if (addr != 0) {
if (kvm_read(kvm, addr, &setidt_disp,
sizeof(setidt_disp)) !=
sizeof(setidt_disp))
warnx("kvm_read: %s", kvm_geterr(kvm));
else
pcx -= setidt_disp;
}
}
cache->pc = pcx;
find_pc_partial_function(cache->pc, &pname, NULL, NULL);
if (pname[0] != 'X')
cache->frame_type = FT_NORMAL;
......@@ -373,6 +389,8 @@ kgdb_trgt_trapframe_sniffer(struct frame_info *next_frame)
CORE_ADDR pc;
pc = frame_pc_unwind(next_frame);
if (pc >= PMAP_TRM_MIN_ADDRESS)
return (&kgdb_trgt_trapframe_unwind);
pname = NULL;
find_pc_partial_function(pc, &pname, NULL, NULL);
if (pname == NULL)
......
......@@ -483,6 +483,7 @@ i386/i386/atomic.c standard \
i386/i386/bios.c standard
i386/i386/bioscall.s standard
i386/i386/bpf_jit_machdep.c optional bpf_jitter
i386/i386/copyout.c standard
i386/i386/db_disasm.c optional ddb
i386/i386/db_interface.c optional ddb
i386/i386/db_trace.c optional ddb
......
......@@ -6,7 +6,7 @@ SEARCH_DIR(/usr/lib);
SECTIONS
{
/* Read-only sections, merged into text segment: */
. = kernbase + kernload + SIZEOF_HEADERS;
. = kernbase + SIZEOF_HEADERS;
.interp : { *(.interp) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
......
......@@ -109,7 +109,11 @@ dcons_crom_expose_idt(struct dcons_crom_softc *sc)
static off_t idt_paddr;
/* XXX */
#ifdef __amd64__
idt_paddr = (char *)idt - (char *)KERNBASE;
#else /* __i386__ */
idt_paddr = (off_t)pmap_kextract((vm_offset_t)idt);
#endif
crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_HI, ADDR_HI(idt_paddr));
crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_LO, ADDR_LO(idt_paddr));
......
......@@ -309,11 +309,16 @@ dcons_drv_init(int stage)
* Allow read/write access to dcons buffer.
*/
for (pa = trunc_page(addr); pa < addr + size; pa += PAGE_SIZE)
*vtopte(KERNBASE + pa) |= PG_RW;
*vtopte(PMAP_MAP_LOW + pa) |= PG_RW;
invltlb();
#endif
/* XXX P to V */
#ifdef __amd64__
dg.buf = (struct dcons_buf *)(vm_offset_t)(KERNBASE + addr);
#else /* __i386__ */
dg.buf = (struct dcons_buf *)((vm_offset_t)PMAP_MAP_LOW +
addr);
#endif
dg.size = size;
if (dcons_load_buffer(dg.buf, dg.size, sc) < 0)
dg.buf = NULL;
......
......@@ -26,11 +26,12 @@
* $FreeBSD$
*/
#include "assym.inc"
#include <machine/psl.h>
#include <machine/asmacros.h>
#include <machine/specialreg.h>
#include "assym.inc"
/*
* This is the Hyper-V vmbus channel direct callback interrupt.
* Only used when it is running on Hyper-V.
......@@ -42,6 +43,7 @@ IDTVEC(vmbus_isr)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
call vmbus_handle_intr
......
......@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/vmparam.h>
#include <machine/pc/bios.h>
#endif
#include <dev/ppbus/ppbconf.h>
......@@ -121,7 +122,7 @@ static char *ppc_epp_protocol[] = { " (EPP 1.9)", " (EPP 1.7)", 0 };
* BIOS printer list - used by BIOS probe.
*/
#define BIOS_PPC_PORTS 0x408
#define BIOS_PORTS (short *)(KERNBASE+BIOS_PPC_PORTS)
#define BIOS_PORTS ((short *)BIOS_PADDRTOVADDR(BIOS_PPC_PORTS))
#define BIOS_MAX_PPC 4
#endif
......
......@@ -288,7 +288,11 @@ ec_putc(int c)
* This is enough for ec_putc() to work very early on x86
* if the kernel starts in normal color text mode.
*/
#ifdef __amd64__
fb = KERNBASE + 0xb8000;
#else /* __i386__ */
fb = PMAP_MAP_LOW + 0xb8000;
#endif
xsize = 80;
ysize = 25;
#endif
......
......@@ -894,19 +894,6 @@ options ENABLE_ALART # Control alarm on Intel intpm driver
#
options PMAP_SHPGPERPROC=201
#
# Change the size of the kernel virtual address space. Due to
# constraints in loader(8) on i386, this must be a multiple of 4.
# 256 = 1 GB of kernel address space. Increasing this also causes
# a reduction of the address space in user processes. 512 splits
# the 4GB cpu address space in half (2GB user, 2GB kernel). For PAE
# kernels, the value will need to be double non-PAE. A value of 1024
# for PAE kernels is necessary to split the address space in half.
# This will likely need to be increased to handle memory sizes >4GB.
# PAE kernels default to a value of 512.
#
options KVA_PAGES=260
#
# Number of initial kernel page table pages used for early bootstrap.
# This number should include enough pages to map the kernel, any
......@@ -951,22 +938,6 @@ device ndis
#####################################################################
# VM OPTIONS
# Disable the 4 MByte page PSE CPU feature. The PSE feature allows the
# kernel to use 4 MByte pages to map the kernel instead of 4k pages.
# This saves on the amount of memory needed for page tables needed to
# map the kernel. You should only disable this feature as a temporary
# workaround if you are having problems with it enabled.
#
#options DISABLE_PSE
# Disable the global pages PGE CPU feature. The PGE feature allows pages
# to be marked with the PG_G bit. TLB entries for these pages are not
# flushed from the cache when %cr3 is reloaded. This can make context
# switches less expensive. You should only disable this feature as a
# temporary workaround if you are having problems with it enabled.
#
#options DISABLE_PG_G
# KSTACK_PAGES is the number of memory pages to assign to the kernel
# stack of each thread.
......
......@@ -39,6 +39,7 @@
#include "opt_smp.h"
#include <machine/asmacros.h>
#include <machine/psl.h>
#include <machine/specialreg.h>
#include <x86/apicreg.h>
......@@ -67,34 +68,39 @@ as_lapic_eoi:
* translates that into a vector, and passes the vector to the
* lapic_handle_intr() function.
*/
#define ISR_VEC(index, vec_name) \
.text ; \
SUPERALIGN_TEXT ; \
IDTVEC(vec_name ## _pti) ; \
IDTVEC(vec_name) ; \
PUSH_FRAME ; \
SET_KERNEL_SREGS ; \
cld ; \
FAKE_MCOUNT(TF_EIP(%esp)) ; \
cmpl $0,x2apic_mode ; \
je 1f ; \
movl $(MSR_APIC_ISR0 + index),%ecx ; \
rdmsr ; \
jmp 2f ; \
1: ; \
movl lapic_map, %edx ;/* pointer to local APIC */ \
movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \
2: ; \
bsrl %eax, %eax ; /* index of highest set bit in ISR */ \
jz 3f ; \
addl $(32 * index),%eax ; \
pushl %esp ; \
pushl %eax ; /* pass the IRQ */ \
call lapic_handle_intr ; \
addl $8, %esp ; /* discard parameter */ \
3: ; \
MEXITCOUNT ; \
.macro ISR_VEC index, vec_name
.text
SUPERALIGN_TEXT
.globl X\()\vec_name\()_pti, X\()\vec_name
X\()\vec_name\()_pti:
X\()\vec_name:
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
cmpl $0,x2apic_mode
je 2f
movl $(MSR_APIC_ISR0 + \index),%ecx
rdmsr
jmp 3f
2:
movl lapic_map, %edx /* pointer to local APIC */
movl LA_ISR + 16 * \index(%edx), %eax /* load ISR */
3:
bsrl %eax, %eax /* index of highest set bit in ISR */
jz 4f
addl $(32 * \index),%eax
pushl %esp
pushl %eax /* pass the IRQ */
movl $lapic_handle_intr, %eax
call *%eax
addl $8, %esp /* discard parameter */
4:
MEXITCOUNT
jmp doreti
.endm
/*
* Handle "spurious INTerrupts".
......@@ -111,13 +117,13 @@ IDTVEC(spuriousint)
iret
ISR_VEC(1, apic_isr1)
ISR_VEC(2, apic_isr2)
ISR_VEC(3, apic_isr3)
ISR_VEC(4, apic_isr4)
ISR_VEC(5, apic_isr5)
ISR_VEC(6, apic_isr6)
ISR_VEC(7, apic_isr7)
ISR_VEC 1, apic_isr1
ISR_VEC 2, apic_isr2
ISR_VEC 3, apic_isr3
ISR_VEC 4, apic_isr4
ISR_VEC 5, apic_isr5
ISR_VEC 6, apic_isr6
ISR_VEC 7, apic_isr7
/*
* Local APIC periodic timer handler.
......@@ -129,9 +135,11 @@ IDTVEC(timerint)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
call lapic_handle_timer
movl $lapic_handle_timer, %eax
call *%eax
add $4, %esp
MEXITCOUNT
jmp doreti
......@@ -146,8 +154,10 @@ IDTVEC(cmcint)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
call lapic_handle_cmc
movl $lapic_handle_cmc, %eax
call *%eax
MEXITCOUNT
jmp doreti
......@@ -161,8 +171,10 @@ IDTVEC(errorint)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
call lapic_handle_error
movl $lapic_handle_error, %eax
call *%eax
MEXITCOUNT
jmp doreti
......@@ -177,9 +189,11 @@ IDTVEC(xen_intr_upcall)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
call xen_intr_handle_upcall
movl $xen_intr_handle_upcall, %eax
call *%eax
add $4, %esp
MEXITCOUNT
jmp doreti
......@@ -200,9 +214,9 @@ IDTVEC(invltlb)
PUSH_FRAME
SET_KERNEL_SREGS
cld
call invltlb_handler
KENTER
movl $invltlb_handler, %eax
call *%eax
jmp invltlb_ret
/*
......@@ -214,9 +228,9 @@ IDTVEC(invlpg)
PUSH_FRAME
SET_KERNEL_SREGS
cld
call invlpg_handler
KENTER
movl $invlpg_handler, %eax
call *%eax
jmp invltlb_ret
/*
......@@ -228,9 +242,9 @@ IDTVEC(invlrng)
PUSH_FRAME
SET_KERNEL_SREGS
cld
call invlrng_handler
KENTER
movl $invlrng_handler, %eax
call *%eax
jmp invltlb_ret
/*
......@@ -242,9 +256,9 @@ IDTVEC(invlcache)
PUSH_FRAME
SET_KERNEL_SREGS
cld
call invlcache_handler
KENTER
movl $invlcache_handler, %eax
call *%eax
jmp invltlb_ret
/*
......@@ -256,12 +270,11 @@ IDTVEC(ipi_intr_bitmap_handler)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
call as_lapic_eoi
FAKE_MCOUNT(TF_EIP(%esp))
call ipi_bitmap_handler
movl $ipi_bitmap_handler, %eax
call *%eax
MEXITCOUNT
jmp doreti
......@@ -274,9 +287,10 @@ IDTVEC(cpustop)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
call as_lapic_eoi
call cpustop_handler
movl $cpustop_handler, %eax
call *%eax
jmp doreti
/*
......@@ -288,9 +302,10 @@ IDTVEC(cpususpend)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
call as_lapic_eoi
call cpususpend_handler
movl $cpususpend_handler, %eax
call *%eax
jmp doreti
/*
......@@ -304,14 +319,14 @@ IDTVEC(rendezvous)
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
#ifdef COUNT_IPIS
movl PCPU(CPUID), %eax
movl ipi_rendezvous_counts(,%eax,4), %eax
incl (%eax)
#endif
call smp_rendezvous_action
movl $smp_rendezvous_action, %eax
call *%eax
call as_lapic_eoi
jmp doreti
......
......@@ -36,6 +36,7 @@
* master and slave interrupt controllers.
*/
#include <machine/psl.h>
#include <machine/asmacros.h>
#include "assym.inc"
......@@ -43,37 +44,41 @@
/*
* Macros for interrupt entry, call to handler, and exit.
*/
#define INTR(irq_num, vec_name) \
.text ; \
SUPERALIGN_TEXT ; \
IDTVEC(vec_name ##_pti) ; \
IDTVEC(vec_name) ; \
PUSH_FRAME ; \
SET_KERNEL_SREGS ; \
cld ; \
; \
FAKE_MCOUNT(TF_EIP(%esp)) ; \
pushl %esp ; \
pushl $irq_num; /* pass the IRQ */ \
call atpic_handle_intr ; \
addl $8, %esp ; /* discard the parameters */ \
; \
MEXITCOUNT ; \
.macro INTR irq_num, vec_name
.text
SUPERALIGN_TEXT
.globl X\()\vec_name\()_pti, X\()\vec_name
X\()\vec_name\()_pti:
X\()\vec_name:
PUSH_FRAME
SET_KERNEL_SREGS
cld
KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
pushl $\irq_num /* pass the IRQ */
movl $atpic_handle_intr, %eax
call *%eax
addl $8, %esp /* discard the parameters */
MEXITCOUNT
jmp doreti
.endm
INTR(0, atpic_intr0)
INTR(1, atpic_intr1)
INTR(2, atpic_intr2)
INTR(3, atpic_intr3)
INTR(4, atpic_intr4)
INTR(5, atpic_intr5)
INTR(6, atpic_intr6)
INTR(7, atpic_intr7)
INTR(8, atpic_intr8)
INTR(9, atpic_intr9)
INTR(10, atpic_intr10)
INTR(11, atpic_intr11)
INTR(12, atpic_intr12)
INTR(13, atpic_intr13)
INTR(14, atpic_intr14)
INTR(15, atpic_intr15)
INTR 0, atpic_intr0
INTR 1, atpic_intr1
INTR 2, atpic_intr2
INTR 3, atpic_intr3
INTR 4, atpic_intr4
INTR 5, atpic_intr5
INTR 6, atpic_intr6
INTR 7, atpic_intr7
INTR 8, atpic_intr8
INTR 9, atpic_intr9
INTR 10, atpic_intr10
INTR 11, atpic_intr11
INTR 12, atpic_intr12
INTR 13, atpic_intr13
INTR 14, atpic_intr14
INTR 15, atpic_intr15
......@@ -305,6 +305,7 @@ set_bios_selectors(struct bios_segments *seg, int flags)
}
extern int vm86pa;
extern u_long vm86phystk;
extern void bios16_jmp(void);
/*
......@@ -329,7 +330,7 @@ bios16(struct bios_args *args, char *fmt, ...)
int flags = BIOSCODE_FLAG | BIOSDATA_FLAG;
u_int i, arg_start, arg_end;
pt_entry_t *pte;
pd_entry_t *ptd;
pd_entry_t *ptd, orig_ptd;
arg_start = 0xffffffff;
arg_end = 0;
......@@ -390,27 +391,14 @@ bios16(struct bios_args *args, char *fmt, ...)
args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME;
args->seg.code32.limit = 0xffff;
ptd = (pd_entry_t *)rcr3();
#if defined(PAE) || defined(PAE_TABLES)
if (ptd == IdlePDPT)
#else
if (ptd == IdlePTD)
#endif
{
/*
* no page table, so create one and install it.
*/
pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
ptd = (pd_entry_t *)((u_int)IdlePTD + KERNBASE);
*pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V;
*ptd = vtophys(pte) | PG_RW | PG_V;
} else {
/*
* this is a user-level page table
*/
pte = PTmap;
*pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V;
}
/*
* no page table, so create one and install it.
*/
pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
ptd = IdlePTD;
*pte = vm86phystk | PG_RW | PG_V;
orig_ptd = *ptd;
*ptd = vtophys(pte) | PG_RW | PG_V;
pmap_invalidate_all(kernel_pmap); /* XXX insurance for now */
stack_top = stack;
......@@ -464,20 +452,12 @@ bios16(struct bios_args *args, char *fmt, ...)
i = bios16_call(&args->r, stack_top);
if (pte == PTmap) {
*pte = 0; /* remove entry */
/*
* XXX only needs to be invlpg(0) but that doesn't work on the 386
*/
pmap_invalidate_all(kernel_pmap);
} else {
*ptd = 0; /* remove page table */
/*
* XXX only needs to be invlpg(0) but that doesn't work on the 386
*/
pmap_invalidate_all(kernel_pmap);
free(pte, M_TEMP); /* ... and free it */
}
*ptd = orig_ptd; /* remove page table */
/*
* XXX only needs to be invlpg(0) but that doesn't work on the 386
*/
pmap_invalidate_all(kernel_pmap);
free(pte, M_TEMP); /* ... and free it */
return (i);
}
......
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2018 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Konstantin Belousov <kib@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND