vmmon: build fixes for newer Linux kernels

Makefile:
  * Use KVERSION as the kernel release when it is defined; otherwise keep
    using the output of `uname -r`.
  * Stop adding -mindirect-branch=thunk -mindirect-branch-register to CC_OPTS.

linux/hostif.c:
  * Add get_task_state() so task state is read through READ_ONCE(->__state)
    where the kernel provides it, and through ->state elsewhere.
  * MapCrossPage(): on 5.8 and later, rewrite the PTE of the vmap()ed cross
    page with VM_PAGE_KERNEL_EXEC and flush the TLB.
  * Use global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) on 5.9 and later.
  * Remove get_fs()/set_fs(KERNEL_DS) users: isVAReadable() uses
    get_kernel_nofault() when HAVE_GET_KERNEL_NOFAULT is defined, and the
    semaphore helpers use kernel_read()/kernel_write().
  * HostIF_SemaphoreSignal(): signal eventfds through eventfd_signal() and
    release the context reference taken by eventfd_ctx_fileget().
  * isVAReadable(): compute the APICR_VERSION register address once so both
    the get_kernel_nofault() and the legacy variant probe the same address.

NOTE(review): this copy of the patch has lost the header names on the
"#include" lines and, apparently, blank context lines. Regenerate it against
the target tree before applying.

--- a/vmmon/Makefile
+++ b/vmmon/Makefile
@@ -43,7 +43,11 @@
 endif
+ifdef KVERSION
+VM_UNAME = $(KVERSION)
+else
 VM_UNAME = $(shell uname -r)
+endif
 # Header directory for the running kernel
 ifdef LINUXINCLUDE
@@ -137,7 +141,6 @@
 endif
 # Add Spectre options when available
-CC_OPTS += $(call vm_check_gcc,-mindirect-branch=thunk -mindirect-branch-register,)
 include $(SRCROOT)/Makefile.kernel
--- a/vmmon/linux/hostif.c
+++ b/vmmon/linux/hostif.c
@@ -47,6 +47,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -54,6 +56,7 @@
 #include
 #include
 #include // For linux/sched/signal.h without version check
+#include
 #include "vmware.h"
 #include "x86apic.h"
@@ -80,6 +83,13 @@
 #error CONFIG_HIGH_RES_TIMERS required for acceptable performance
 #endif
+/* task's state is read-once rather than volatile from 5.14-rc2. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || defined(get_current_state)
+#define get_task_state(task) READ_ONCE((task)->__state)
+#else
+#define get_task_state(task) ((task)->state)
+#endif
+
 /*
  * Although this is not really related to kernel-compatibility, I put this
  * helper macro here for now for a lack of better place --hpreg
@@ -463,7 +473,7 @@
       ASSERT(vcpuid < vm->numVCPUs);
       t = vm->vmhost->vcpuSemaTask[vcpuid];
       VCPUSet_Remove(&req, vcpuid);
-      if (t && (t->state & TASK_INTERRUPTIBLE)) {
+      if (t && (get_task_state(t) & TASK_INTERRUPTIBLE)) {
          wake_up_process(t);
       }
    }
@@ -614,6 +617,15 @@
    MutexUnlock(&fastClockMutex, callerID);
 }
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+static int crosspage_set_exec(pte_t *pte, unsigned long addr, void *data)
+{
+   struct page *p = data;
+
+   set_pte(pte, mk_pte(p, VM_PAGE_KERNEL_EXEC));
+   return 0;
+}
+#endif
 /*
  *----------------------------------------------------------------------
@@ -634,7 +646,29 @@
 static void *
 MapCrossPage(struct page *p)  // IN:
 {
+#if COMPAT_LINUX_VERSION_CHECK_LT(5, 8, 0)
    return vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);
+#else
+   void *addr;
+
+   addr = vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);
+   if (!addr)
+      return NULL;
+
+   /* Starting with 5.8, vmap() always sets the NX bit, but the cross
+    * page needs to be executable. */
+   if (apply_to_page_range(current->mm, (unsigned long)addr, PAGE_SIZE,
+                           crosspage_set_exec, p)) {
+      vunmap(addr);
+      return NULL;
+   }
+
+   preempt_disable();
+   __flush_tlb_all();
+   preempt_enable();
+
+   return addr;
+#endif
 }
@@ -1739,7 +1773,10 @@
    lockedPages += global_page_state(NR_PAGETABLE);
 #endif
    /* NR_SLAB_* moved from zone to node in 4.13. */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+   lockedPages += global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
    lockedPages += global_node_page_state(NR_SLAB_UNRECLAIMABLE);
 #else
    lockedPages += global_page_state(NR_SLAB_UNRECLAIMABLE);
@@ -2389,16 +2426,22 @@
 static Bool
 isVAReadable(VA r)  // IN:
 {
-   mm_segment_t old_fs;
    uint32 dummy;
    int ret;
+   r = APICR_TO_ADDR(r, APICR_VERSION);
+#ifdef HAVE_GET_KERNEL_NOFAULT
+   ret = get_kernel_nofault(dummy, (void *)r);
+#else
+   {
+      mm_segment_t old_fs;
+
    old_fs = get_fs();
    set_fs(KERNEL_DS);
-   r = APICR_TO_ADDR(r, APICR_VERSION);
    ret = HostIF_CopyFromUser(&dummy, r, sizeof dummy);
    set_fs(old_fs);
-
+   }
+#endif
    return ret == 0;
 }
 #endif
@@ -2553,7 +2596,6 @@
                      uint64 *args)  // IN:
 {
    struct file *file;
-   mm_segment_t old_fs;
    int res;
    int waitFD = args[0];
    int timeoutms = args[2];
@@ -2566,22 +2610,19 @@
       return MX_WAITERROR;
    }
-   old_fs = get_fs();
-   set_fs(KERNEL_DS);
-
    {
       struct poll_wqueues table;
       unsigned int mask;
       poll_initwait(&table);
-      current->state = TASK_INTERRUPTIBLE;
+      __set_current_state(TASK_INTERRUPTIBLE);
       mask = file->f_op->poll(file, &table.pt);
       if (!(mask & (POLLIN | POLLERR | POLLHUP))) {
          vm->vmhost->vcpuSemaTask[vcpuid] = current;
          schedule_timeout(timeoutms * HZ / 1000);  // convert to Hz
          vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
       }
-      current->state = TASK_RUNNING;
+      __set_current_state(TASK_RUNNING);
       poll_freewait(&table);
    }
@@ -2590,9 +2631,11 @@
     * the code to happily deal with a pipe or an eventfd. We only care about
     * reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64).
     */
-
-   res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos);
-
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+   res = kernel_read(file, file->f_pos, (char *)&value, sizeof value);
+#else
+   res = kernel_read(file, &value, sizeof value, &file->f_pos);
+#endif
    if (res == sizeof value) {
       res = MX_WAITNORMAL;
    } else {
@@ -2601,7 +2644,6 @@
       }
    }
-   set_fs(old_fs);
    fput(file);
    /*
@@ -2658,7 +2705,7 @@
           */
          struct task_struct *t = (struct task_struct *)xchg(&vm->vmhost->vcpuSemaTask[vcpuid], NULL);
-         if (t && (t->state & TASK_INTERRUPTIBLE)) {
+         if (t && (get_task_state(t) & TASK_INTERRUPTIBLE)) {
             wake_up_process(t);
          }
       } ROF_EACH_VCPU_IN_SET_WITH_MAX();
@@ -2688,8 +2730,8 @@
 int
 HostIF_SemaphoreSignal(uint64 *args)  // IN:
 {
+   struct eventfd_ctx *eventfd;
    struct file *file;
-   mm_segment_t old_fs;
    int res;
    int signalFD = args[1];
    uint64 value = 1;  // make an eventfd happy should it be there
@@ -2699,22 +2741,33 @@
       return MX_WAITERROR;
    }
-   old_fs = get_fs();
-   set_fs(KERNEL_DS);
+   /*
+    * If it's eventfd, use specific eventfd interface as kernel writes
+    * to eventfd may not be allowed in kernel 5.10 and later.
+    */
+   eventfd = eventfd_ctx_fileget(file);
+   if (!IS_ERR(eventfd)) {
+      eventfd_signal(eventfd, 1);
+      eventfd_ctx_put(eventfd);  // drop the reference taken by eventfd_ctx_fileget()
+      fput(file);
+      return MX_WAITNORMAL;
+   }
    /*
     * Always write sizeof(uint64) bytes. This works fine for eventfd and
     * pipes. The data written is formatted to make an eventfd happy should
     * it be present.
     */
-
-   res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+   res = kernel_write(file, (char *)&value, sizeof value, file->f_pos);
+#else
+   res = kernel_write(file, &value, sizeof value, &file->f_pos);
+#endif
    if (res == sizeof value) {
       res = MX_WAITNORMAL;
    }
-   set_fs(old_fs);
    fput(file);
    /*
@@ -3261,12 +3314,9 @@
 HostIFFastClockThread(void *unused)  // IN:
 {
    int res;
-   mm_segment_t oldFS;
    unsigned int rate = 0;
    unsigned int prevRate = 0;
-   oldFS = get_fs();
-   set_fs(KERNEL_DS);
    allow_signal(SIGKILL);
    while ((rate = linuxState.fastClockRate) > MIN_RATE) {
@@ -3289,8 +3339,6 @@
    }
 out:
-   set_fs(oldFS);
-
    /*
     * Do not exit thread until we are told to do so.
     */