/* blob: ec678aafa3f866bdd44b011e1e8b80de0ff3106a */
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
#include <asm/kaiser.h>
#include <asm/tlbflush.h> /* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>
/*
* Global on/off switch for kernel/user page table isolation.
* Starts out enabled; kaiser_check_boottime_disable() clears it, and
* every entry point in this file returns early when it is zero.
*/
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
/*
* Per-cpu scratch word placed in the "user mapped" per-cpu section, i.e.
* the region that kaiser_init() maps into the shadow page tables for
* every possible CPU. It is not referenced anywhere else in this file.
* NOTE(review): presumably used by the entry code to stash a register
* before a kernel stack is available - confirm against the entry asm.
*/
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* Per-cpu value that gets or'd into CR3 when switching to the user
* (shadow) page tables. It is written by kaiser_setup_pcid() and by
* kaiser_flush_tlb_on_return_to_user().
*
* The value can have bit 63 set, so we can not just use a plain "or"
* instruction to get it or'd into CR3. It would take another
* register. So, we use a memory reference to it instead.
*
* This is also handy because systems that do not support PCIDs
* just end up or'ing in KAISER_SHADOW_PGD_OFFSET with no PCID bits,
* which does no harm.
*/
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
* be populating the same addresses, so we only need to ensure
* that we protect between two CPUs trying to allocate and
* populate the same page table page.
*
* Only take this lock when installing a new page table page, i.e.
* when doing a set_p[4um]d() (in this file: the set_pud() and
* set_pmd() calls in kaiser_pagetable_walk()). It is not needed for
* doing a set_pte(). We assume that only the *owner* of a given
* allocation will be doing this for _their_ allocation.
*
* This ensures that once a system has been running for a while
* and there have been stacks all over and these page tables
* are fully populated, there will be no further acquisitions of
* this lock.
*/
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/*
 * Look up the physical address that a kernel virtual address is mapped
 * to by walking the kernel page tables (pgd -> pud -> pmd -> pte).
 * Large mappings at the pud or pmd level are resolved directly.
 *
 * Returns -1 (converted to unsigned long) if a pud, pmd or pte on the
 * way is not populated; a one-time warning is emitted for each level.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgdp = pgd_offset_k(vaddr);
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * All kernel PGDs were made present in kaiser_init() and are
	 * expected to stay that way.
	 */
	BUG_ON(pgd_none(*pgdp));
	/*
	 * A large mapping at PGD level would cover 512GB or 128TB on
	 * x86_64. That is not handled here.
	 */
	BUG_ON(pgd_large(*pgdp));

	pudp = pud_offset(pgdp, vaddr);
	if (WARN_ON_ONCE(pud_none(*pudp)))
		return -1;
	if (pud_large(*pudp)) {
		unsigned long offset = vaddr & ~PUD_PAGE_MASK;

		return (pud_pfn(*pudp) << PAGE_SHIFT) | offset;
	}

	pmdp = pmd_offset(pudp, vaddr);
	if (WARN_ON_ONCE(pmd_none(*pmdp)))
		return -1;
	if (pmd_large(*pmdp)) {
		unsigned long offset = vaddr & ~PMD_PAGE_MASK;

		return (pmd_pfn(*pmdp) << PAGE_SHIFT) | offset;
	}

	ptep = pte_offset_kernel(pmdp, vaddr);
	if (WARN_ON_ONCE(pte_none(*ptep)))
		return -1;

	return (pte_pfn(*ptep) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
* This is a relatively normal page table walk, except that it
* also tries to allocate page tables pages along the way.
*
* The walk is done on the shadow page tables: it starts from the
* shadow copy of the kernel pgd entry covering @address.
*
* @address: kernel virtual address whose shadow PTE slot is wanted
* @user: true if the mapping must carry _PAGE_USER; only
* VSYSCALL_ADDR is accepted in that case (BUG otherwise)
*
* Returns a pointer to a PTE on success, or NULL on failure.
* Failure means: the shadow pgd entry is not populated, a large
* mapping was found at pud or pmd level, or a page table page could
* not be allocated.
*/
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
pmd_t *pmd;
pud_t *pud;
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
/* New page table pages must come back zeroed (__GFP_ZERO). */
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
/* Protection bits for newly installed pud/pmd entries. */
unsigned long prot = _KERNPG_TABLE;
if (pgd_none(*pgd)) {
WARN_ONCE(1, "All shadow pgds should have been populated");
return NULL;
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
if (user) {
/*
* The vsyscall page is the only page that will have
* _PAGE_USER set. Catch everything else.
*/
BUG_ON(address != VSYSCALL_ADDR);
/* Mark the pgd entry itself as user accessible. */
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
/*
* Lower levels created below use _PAGE_TABLE instead of
* _KERNPG_TABLE (presumably the variant with _PAGE_USER set).
*/
prot = _PAGE_TABLE;
}
pud = pud_offset(pgd, address);
/* The shadow page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
/*
* Allocate before taking the lock; the lock only covers the
* re-check and the install of the new pmd page.
*/
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pud_none(*pud)) {
set_pud(pud, __pud(prot | __pa(new_pmd_page)));
__inc_zone_page_state(virt_to_page((void *)
new_pmd_page), NR_KAISERTABLE);
} else
/* Somebody else populated the pud meanwhile: drop our page. */
free_page(new_pmd_page);
spin_unlock(&shadow_table_allocation_lock);
}
pmd = pmd_offset(pud, address);
/* The shadow page tables do not use large mappings: */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
/* Same allocate / lock / re-check sequence as for the pud above. */
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pmd_none(*pmd)) {
set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
__inc_zone_page_state(virt_to_page((void *)
new_pte_page), NR_KAISERTABLE);
} else
/* Somebody else populated the pmd meanwhile: drop our page. */
free_page(new_pte_page);
spin_unlock(&shadow_table_allocation_lock);
}
return pte_offset_kernel(pmd, address);
}
/*
 * Mirror the kernel mapping of [__start_addr, __start_addr + size) into
 * the shadow page tables, one page at a time, using @flags as the PTE
 * protection bits. The range is widened to whole pages.
 *
 * Returns 0 on success, -EIO if a page in the range has no kernel
 * mapping, or -ENOMEM if the shadow page table walk fails. Pages
 * processed before a failure stay mapped.
 */
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	const unsigned long first = (unsigned long)__start_addr;
	const unsigned long limit = PAGE_ALIGN(first + size);
	unsigned long va;

	/*
	 * Callers conveniently pass __PAGE_KERNEL and friends. Setting
	 * _PAGE_GLOBAL does no actual harm as long as CR4.PGE is not
	 * set, but it is troubling to see Kaiser itself set it (now
	 * that "nokaiser" requires it not to be #defined to 0), so it
	 * is stripped here.
	 */
	flags &= ~_PAGE_GLOBAL;
	/* Do not ask for NX when the PTE mask does not support it. */
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (va = first & PAGE_MASK; va < limit; va += PAGE_SIZE) {
		unsigned long pa = get_pa_from_mapping(va);
		pte_t *ptep;

		if (pa == -1)
			return -EIO;

		ptep = kaiser_pagetable_walk(va, flags & _PAGE_USER);
		if (!ptep)
			return -ENOMEM;

		if (!pte_none(*ptep)) {
			/*
			 * Already mapped: only verify that the existing
			 * entry is the one we would have written.
			 */
			pte_t expected;

			set_pte(&expected, __pte(flags | pa));
			WARN_ON_ONCE(!pte_same(*ptep, expected));
			continue;
		}

		set_pte(ptep, __pte(flags | pa));
	}

	return 0;
}
/*
 * Convenience wrapper around kaiser_add_user_map() for callers that
 * have a [start, end) pointer pair instead of a start and a size.
 */
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	return kaiser_add_user_map(start, end - start, flags);
}
/*
 * Ensure that the top level of the (shadow) page tables are
 * entirely populated. This ensures that all processes that get
 * forked have the same entries. This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Only the upper (kernel) half of the pgd is populated: indexes
 * PTRS_PER_PGD / 2 up to PTRS_PER_PGD - 1.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pud_t *pud;

		/*
		 * Make sure not to stomp on some other pgd entry. Check
		 * this before allocating, so that an occupied slot does
		 * not leak a pud page or inflate NR_KAISERTABLE.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}

		pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}

		/* Account the page only once it is certain to be installed. */
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(pud)));
	}
}
/*
 * Boot-time variants of kaiser_add_user_map() and
 * kaiser_add_user_map_ptrs(): the mapping is attempted and a failure
 * is only reported with a warning, there is no return value.
 */
#define kaiser_add_user_map_early(start, size, flags)			\
	do {								\
		WARN_ON(kaiser_add_user_map(start, size, flags));	\
	} while (0)
#define kaiser_add_user_map_ptrs_early(start, end, flags)		\
	do {								\
		WARN_ON(kaiser_add_user_map_ptrs(start, end, flags));	\
	} while (0)
/*
 * Decide at boot whether page table isolation stays enabled.
 *
 *  - Xen PV guests: disabled without any message.
 *  - "pti=on":   forced on, the vendor check is skipped.
 *  - "pti=off" or "nopti": disabled.
 *  - "pti=auto" or nothing recognised: disabled on AMD CPUs, enabled
 *    otherwise.
 *
 * Enabling forces X86_FEATURE_KAISER; disabling clears it together
 * with kaiser_enabled.
 */
void __init kaiser_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	/*
	 * cmdline_find_option() reports the full length of the option
	 * value even when it had to be truncated to fit arg[]. Compare
	 * against that length so that only the exact words are accepted
	 * and things like "pti=only" or "pti=offline" fall through to
	 * the default handling instead of matching by prefix.
	 */
	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret == 2 && !strncmp(arg, "on", 2))
		goto enable;
	if (ret == 3 && !strncmp(arg, "off", 3))
		goto disable;
	if (ret == 4 && !strncmp(arg, "auto", 4))
		goto skip;

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	setup_force_cpu_cap(X86_FEATURE_KAISER);
	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
/*
 * Populate the shadow page tables with everything that has to stay
 * mapped while running on the user CR3: the vsyscall page, the
 * user-mapped per-cpu area of every possible CPU, the entry text and
 * the IDT tables/descriptors.
 *
 * Failures are only warned about, never BUG()ed on: if something in
 * here goes wrong we will most likely die on one of the first
 * kernel->user transitions and init will die. By then most of the
 * kernel is up and a clean warning can get out, whereas a BUG_ON()
 * here might fire before there is good console output.
 */
void __init kaiser_init(void)
{
	unsigned long user_mapped_len;
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * The vsyscall mapping sets _PAGE_USER, and that has to happen
	 * while the page table hierarchy is being created, i.e. early.
	 * Otherwise kaiser_pagetable_walk() finds already initialized
	 * entries in the hierarchy and does not give them the proper
	 * permissions, which leads to page-protection faults when the
	 * vsyscall page is read.
	 */
	if (vsyscall_enabled()) {
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR, PAGE_SIZE,
					  vsyscall_pgprot);
	}

	/* The user-mapped per-cpu section has the same size on every CPU. */
	user_mapped_len = __per_cpu_user_mapped_end -
			  __per_cpu_user_mapped_start;
	for_each_possible_cpu(cpu) {
		void *cpu_base = __per_cpu_user_mapped_start +
				 per_cpu_offset(cpu);

		kaiser_add_user_map_early(cpu_base, user_mapped_len,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed when
	 * switching between user and kernel in either direction.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif

	/* The IDT itself, read-only. */
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);

#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr, sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif

	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}
/*
 * Public entry point: mirror the kernel mapping of [addr, addr + size)
 * into the shadow page tables with protection @flags.
 *
 * Does nothing and reports success (0) when Kaiser is disabled;
 * otherwise returns whatever kaiser_add_user_map() returns.
 */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (kaiser_enabled)
		return kaiser_add_user_map((const void *)addr, size, flags);

	return 0;
}
/*
 * Remove [start, start + size) from the shadow page tables, handing
 * each pgd-sized piece of the range to unmap_pud_range_nofree() and
 * stepping to the next shadow pgd entry after each piece.
 * Does nothing when Kaiser is disabled.
 */
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr = start;
	unsigned long next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;

	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	while (addr < end) {
		/* End of the current pgd entry's span, clamped to end. */
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
		pgd++;
		addr = next;
	}
}
/*
 * A top-level page table page is page-aligned; its lower half holds the
 * userspace entries and its upper half the kernel entries.
 *
 * Returns true if @pgdp points into the lower half of its page, i.e. at
 * a user entry that must be copied into both the user and the kernel
 * copy of the page tables. Returns false for kernel entries, which
 * belong only in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	unsigned long offset_in_page = (unsigned long)pgdp % PAGE_SIZE;

	return offset_in_page < (PAGE_SIZE / 2);
}
/*
 * Hook for pgd updates: propagate a userspace pgd entry that is being
 * written at @pgdp into the shadow pgd where needed.
 *
 * Returns the value the caller should store in the kernel pgd. This is
 * @pgd itself, except that a user entry that was copied to the shadow
 * pgd gets _PAGE_NX added (when NX is supported).
 */
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;

	/*
	 * Only entries with _PAGE_USER are candidates for the shadow
	 * pgd. This skips cases like kexec and EFI, which make
	 * temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (!is_userspace_pgd(pgdp))
			return pgd;

		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		/*
		 * Even though the entry *maps* userspace, make sure
		 * userspace can not execute through the kernel copy. If
		 * we ever get out to userspace while still on the
		 * kernel CR3, userspace crashes instead of running.
		 */
		if (__supported_pte_mask & _PAGE_NX)
			pgd.pgd |= _PAGE_NX;
		return pgd;
	}

	/* A non-user, non-empty entry never touches the shadow pgd. */
	if (pgd.pgd)
		return pgd;

	/*
	 * The entry is being cleared. pgd_clear() cannot check
	 * _PAGE_USER, and is even used to clear corrupted pgd entries:
	 * so just rely on cases like kexec and EFI never to be using
	 * pgd_clear().
	 */
	if (WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE))
		return pgd;
	if (is_userspace_pgd(pgdp))
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;

	return pgd;
}
/*
 * Initialise this CPU's x86_cr3_pcid_user: the shadow pgd offset, plus
 * the user PCID "no flush" bits when the CPU supports PCID.
 */
void kaiser_setup_pcid(void)
{
	unsigned long cr3_bits;

	if (this_cpu_has(X86_FEATURE_PCID))
		cr3_bits = KAISER_SHADOW_PGD_OFFSET | X86_CR3_PCID_USER_NOFLUSH;
	else
		cr3_bits = KAISER_SHADOW_PGD_OFFSET;

	/*
	 * The entry/exit code uses this variable to switch PCID and
	 * pgd and to control TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, cr3_bits);
}
/*
 * Record that this cpu has to flush the USER tlb on its next return to
 * user, by storing the "flush" variant of the user PCID bits in
 * x86_cr3_pcid_user. Without PCID nothing needs to be done: the
 * NOFLUSH bit will never have been set in the first place.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (!this_cpu_has(X86_FEATURE_PCID))
		return;

	this_cpu_write(x86_cr3_pcid_user,
		       X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);