--- a/vmmon/Makefile
+++ b/vmmon/Makefile
@@ -43,7 +43,11 @@ INCLUDE += -I$(SRCROOT)/shared
 endif
 
+ifdef KVERSION
+VM_UNAME = $(KVERSION)
+else
 VM_UNAME = $(shell uname -r)
+endif
 
 # Header directory for the running kernel
 ifdef LINUXINCLUDE
@@ -98,6 +102,13 @@ auto-build: $(DRIVER_KO)
 $(DRIVER): $(DRIVER_KO)
 	if [ $< -nt $@ ] || [ ! -e $@ ] ; then cp -f $< $@; fi
 
+# Use SUBDIRS on 2.x, 3.x, 4.x.  Use M on newer kernels.
+ifeq ($(filter-out 2 3 4,$(firstword $(subst ., ,$(VM_UNAME)))),)
+DIRVAR := SUBDIRS
+else
+DIRVAR := M
+endif
+
 #
 # Define a setup target that gets built before the actual driver.
 # This target may not be used at all, but if it is then it will be defined
@@ -107,7 +118,7 @@ prebuild:: ;
 postbuild:: ;
 
 $(DRIVER_KO): prebuild
-	$(MAKE) -C $(BUILD_DIR) SUBDIRS=$$PWD SRCROOT=$$PWD/$(SRCROOT) \
+	$(MAKE) -C $(BUILD_DIR) $(DIRVAR)=$$PWD SRCROOT=$$PWD/$(SRCROOT) \
 	  MODULEBUILDDIR=$(MODULEBUILDDIR) modules
 	$(MAKE) -C $$PWD SRCROOT=$$PWD/$(SRCROOT) \
 	  MODULEBUILDDIR=$(MODULEBUILDDIR) postbuild

--- a/vmmon/linux/driver.c
+++ b/vmmon/linux/driver.c
@@ -96,7 +96,9 @@ long LinuxDriver_Ioctl(struct file *filp
                        unsigned long ioarg);
 
 static int LinuxDriver_Close(struct inode *inode, struct file *filp);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+static vm_fault_t LinuxDriverFault(struct vm_fault *fault);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 static int LinuxDriverFault(struct vm_fault *fault);
 #else
 static int LinuxDriverFault(struct vm_area_struct *vma, struct vm_fault *fault);
@@ -594,7 +596,12 @@ LinuxDriver_Close(struct inode *inode, /
  *-----------------------------------------------------------------------------
  */
 
-static int
+static
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+vm_fault_t
+#else
+int
+#endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 LinuxDriverFault(struct vm_fault *fault)     //IN/OUT
 #else

--- a/vmmon/linux/hostif.c
+++ b/vmmon/linux/hostif.c
@@ -47,6 +47,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -54,6 +56,7 @@
 #include
 #include
 #include   // For linux/sched/signal.h without version check
+#include
 
 #include "vmware.h"
 #include "x86apic.h"
@@ -593,6 +596,15 @@ HostIF_FastClockUnlock(int callerID) //
    MutexUnlock(&fastClockMutex, callerID);
 }
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+static int crosspage_set_exec(pte_t *pte, unsigned long addr, void *data)
+{
+   struct page *p = data;
+
+   set_pte(pte, mk_pte(p, VM_PAGE_KERNEL_EXEC));
+   return 0;
+}
+#endif
 
 /*
  *----------------------------------------------------------------------
@@ -613,7 +625,29 @@ HostIF_FastClockUnlock(int callerID) //
 static void *
 MapCrossPage(struct page *p)  // IN:
 {
+#if COMPAT_LINUX_VERSION_CHECK_LT(5, 8, 0)
    return vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);
+#else
+   void *addr;
+
+   addr = vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);
+   if (!addr)
+      return NULL;
+
+   /* Starting with 5.8, vmap() always sets the NX bit, but the cross
+    * page needs to be executable. */
+   if (apply_to_page_range(current->mm, (unsigned long)addr, PAGE_SIZE,
+                           crosspage_set_exec, p)) {
+      vunmap(addr);
+      return NULL;
+   }
+
+   preempt_disable();
+   __flush_tlb_all();
+   preempt_enable();
+
+   return addr;
+#endif
 }
@@ -1499,9 +1533,13 @@ HostIF_EstimateLockedPageLimit(const VMD
     * since at least 2.6.0.
     */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0)
    extern unsigned long totalram_pages;
 
    unsigned int totalPhysicalPages = totalram_pages;
+#else
+   unsigned int totalPhysicalPages = totalram_pages();
+#endif
 
    /*
    * Use the memory information linux exports as of late for a more
@@ -1527,7 +1565,10 @@ HostIF_EstimateLockedPageLimit(const VMD
       lockedPages += global_page_state(NR_PAGETABLE);
 #endif
    /* NR_SLAB_* moved from zone to node in 4.13. */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+   lockedPages += global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
    lockedPages += global_node_page_state(NR_SLAB_UNRECLAIMABLE);
 #else
    lockedPages += global_page_state(NR_SLAB_UNRECLAIMABLE);
@@ -1602,6 +1643,49 @@ HostIF_WaitForFreePages(unsigned int tim
 /*
  *----------------------------------------------------------------------
  *
+ * HostIFGetTime --
+ *
+ *      Reads the current time in UPTIME_FREQ units.
+ *
+ * Results:
+ *      The uptime, in units of UPTIME_FREQ.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static uint64
+HostIFGetTime(void)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0)
+   struct timeval tv;
+
+   do_gettimeofday(&tv);
+   return tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ;
+#else
+   struct timespec64 now;
+
+   /*
+    * Use raw time used by Posix timers.  This time is not affected by
+    * NTP adjustments, so it may drift from real time and monotonic time,
+    * but it will stay in sync with other timers.
+    */
+   ktime_get_raw_ts64(&now);
+   /*
+    * UPTIME_FREQ resolution is lower than tv_nsec,
+    * so we have to do division...
+    */
+   ASSERT_ON_COMPILE(1000000000 % UPTIME_FREQ == 0);
+   return now.tv_nsec / (1000000000 / UPTIME_FREQ) + now.tv_sec * UPTIME_FREQ;
+#endif
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
  * HostIFReadUptimeWork --
 *
 *      Reads the current uptime.  The uptime is based on getimeofday,
@@ -1630,7 +1714,6 @@ HostIF_WaitForFreePages(unsigned int tim
 static uint64
 HostIFReadUptimeWork(unsigned long *j)  // OUT: current jiffies
 {
-   struct timeval tv;
    uint64 monotime, uptime, upBase, monoBase;
    int64 diff;
    uint32 version;
@@ -1645,13 +1728,12 @@ HostIFReadUptimeWork(unsigned long *j)
       monoBase = uptimeState.monotimeBase;
    } while (!VersionedAtomic_EndTryRead(&uptimeState.version, version));
 
-   do_gettimeofday(&tv);
+   uptime = HostIFGetTime();
    upBase = Atomic_Read64(&uptimeState.uptimeBase);
 
    monotime = (uint64)(jifs - jifBase) * (UPTIME_FREQ / HZ);
    monotime += monoBase;
 
-   uptime = tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ;
    uptime += upBase;
 
    /*
@@ -1756,13 +1838,11 @@ HostIFUptimeResyncMono(struct timer_list
 void
 HostIF_InitUptime(void)
 {
-   struct timeval tv;
+   uint64 tm;
 
    uptimeState.jiffiesBase = jiffies;
-   do_gettimeofday(&tv);
-   Atomic_Write64(&uptimeState.uptimeBase,
-                  -(tv.tv_usec * (UPTIME_FREQ / 1000000) +
-                    tv.tv_sec * UPTIME_FREQ));
+   tm = HostIFGetTime();
+   Atomic_Write64(&uptimeState.uptimeBase, -tm);
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) && !defined(timer_setup)
    init_timer(&uptimeState.timer);
@@ -2159,16 +2239,22 @@ HostIF_VMLockIsHeld(VMDriver *vm)  // IN
 static Bool
 isVAReadable(VA r)  // IN:
 {
-   mm_segment_t old_fs;
    uint32 dummy;
    int ret;
 
+#ifdef HAVE_GET_KERNEL_NOFAULT
+   ret = get_kernel_nofault(dummy, (void *)r);
+#else
+   {
+   mm_segment_t old_fs;
+
    old_fs = get_fs();
-   set_fs(get_ds());
+   set_fs(KERNEL_DS);
    r = APICR_TO_ADDR(r, APICR_VERSION);
    ret = HostIF_CopyFromUser(&dummy, r, sizeof dummy);
    set_fs(old_fs);
-
+   }
+#endif
    return ret == 0;
 }
@@ -2197,7 +2283,7 @@ SetVMAPICAddr(VMDriver *vm,     // IN/OUT: d
    volatile void *hostapic;
 
    ASSERT_ON_COMPILE(APICR_SIZE <= PAGE_SIZE);
-   hostapic = (volatile void *) ioremap_nocache(ma, PAGE_SIZE);
+   hostapic = (volatile void *) ioremap(ma, PAGE_SIZE);
    if (hostapic) {
       if ((APIC_VERSIONREG(hostapic) & 0xF0) == 0x10) {
          vm->hostAPIC.base = (volatile uint32 (*)[4]) hostapic;
@@ -2353,7 +2439,6 @@ HostIF_SemaphoreWait(VMDriver *vm,   //
                     uint64 *args)  // IN:
 {
    struct file *file;
-   mm_segment_t old_fs;
    int res;
    int waitFD = args[0];
    int timeoutms = args[2];
@@ -2364,9 +2449,6 @@ HostIF_SemaphoreWait(VMDriver *vm,   //
       return MX_WAITERROR;
    }
 
-   old_fs = get_fs();
-   set_fs(get_ds());
-
    {
       struct poll_wqueues table;
       unsigned int mask;
@@ -2388,9 +2470,11 @@ HostIF_SemaphoreWait(VMDriver *vm,   //
        * the code to happily deal with a pipe or an eventfd.  We only care about
        * reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64).
        */
-
-      res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos);
-
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+      res = kernel_read(file, file->f_pos, (char *)&value, sizeof value);
+#else
+      res = kernel_read(file, &value, sizeof value, &file->f_pos);
+#endif
       if (res == sizeof value) {
          res = MX_WAITNORMAL;
       } else {
@@ -2399,7 +2483,6 @@ HostIF_SemaphoreWait(VMDriver *vm,   //
       }
    }
 
-   set_fs(old_fs);
    fput(file);
 
    /*
@@ -2482,8 +2565,8 @@ HostIF_SemaphoreForceWakeup(VMDriver *vm
 int
 HostIF_SemaphoreSignal(uint64 *args)  // IN:
 {
+   struct eventfd_ctx *eventfd;
    struct file *file;
-   mm_segment_t old_fs;
    int res;
    int signalFD = args[1];
    uint64 value = 1;  // make an eventfd happy should it be there
@@ -2493,22 +2576,32 @@ HostIF_SemaphoreSignal(uint64 *args)  //
       return MX_WAITERROR;
    }
 
-   old_fs = get_fs();
-   set_fs(get_ds());
+   /*
+    * If it's eventfd, use specific eventfd interface as kernel writes
+    * to eventfd may not be allowed in kernel 5.10 and later.
+    */
+   eventfd = eventfd_ctx_fileget(file);
+   if (!IS_ERR(eventfd)) {
+      eventfd_signal(eventfd, 1);
+      fput(file);
+      return MX_WAITNORMAL;
+   }
 
    /*
    * Always write sizeof(uint64) bytes. This works fine for eventfd and
   * pipes. The data written is formatted to make an eventfd happy should
   * it be present.
   */
-
-   res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+   res = kernel_write(file, (char *)&value, sizeof value, file->f_pos);
+#else
+   res = kernel_write(file, &value, sizeof value, &file->f_pos);
+#endif
 
    if (res == sizeof value) {
      res = MX_WAITNORMAL;
   }
 
-   set_fs(old_fs);
   fput(file);
 
   /*
@@ -3027,12 +3120,9 @@ static int
 HostIFFastClockThread(void *unused)  // IN:
 {
    int res;
-   mm_segment_t oldFS;
    unsigned int rate = 0;
    unsigned int prevRate = 0;
 
-   oldFS = get_fs();
-   set_fs(KERNEL_DS);
    allow_signal(SIGKILL);
 
    while ((rate = linuxState.fastClockRate) > MIN_RATE) {
@@ -3055,8 +3145,6 @@ HostIFFastClockThread(void *unused)  //
    }
 
   out:
-   set_fs(oldFS);
-
    /*
    * Do not exit thread until we are told to do so.
    */
@@ -3154,7 +3242,6 @@ HostIF_SetFastClockRate(unsigned int rat
       }
    } else {
       if (linuxState.fastClockThread) {
-         force_sig(SIGKILL, linuxState.fastClockThread);
          kthread_stop(linuxState.fastClockThread);
 
          linuxState.fastClockThread = NULL;
@@ -3200,7 +3287,12 @@ HostIF_MapUserMem(VA addr,
 
    ASSERT(handle);
 
-   if (!access_ok(VERIFY_WRITE, p, size)) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0)
+   if (!access_ok(VERIFY_WRITE, p, size))
+#else
+   if (!access_ok(p, size))
+#endif
+   {
       printk(KERN_ERR "%s: Couldn't verify write to uva 0x%p with size %"
             FMTSZ"u\n", __func__, p, size);
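The semaphore changes above drop the set_fs()/get_fs() dance in favour of the in-kernel I/O helpers, and bypass the file write path entirely for eventfds. A minimal standalone sketch of that signalling idiom, assuming a struct file * already obtained via fget() (illustrative only, not part of the patches; compat_eventfd_signal is a made-up name, and the sketch returns 0/-EIO instead of the patch's MX_WAITNORMAL/MX_WAITERROR convention):

#include <linux/version.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/eventfd.h>

/*
 * Signal a semaphore fd from kernel context.  Try the eventfd interface
 * first (kernel writes to an eventfd may be refused as of 5.10), then
 * fall back to kernel_write() for pipes; kernel_write() changed its
 * signature in 4.14 (buffer first, position passed by pointer).
 */
static int compat_eventfd_signal(struct file *file)
{
   u64 value = 1;   /* the format an eventfd expects */
   struct eventfd_ctx *ctx;
   ssize_t res;

   ctx = eventfd_ctx_fileget(file);
   if (!IS_ERR(ctx)) {
      eventfd_signal(ctx, 1);
      eventfd_ctx_put(ctx);   /* drop the reference taken by fileget */
      return 0;
   }

#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
   res = kernel_write(file, (char *)&value, sizeof value, file->f_pos);
#else
   res = kernel_write(file, &value, sizeof value, &file->f_pos);
#endif
   return res == sizeof value ? 0 : -EIO;
}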
From 9fda02bce13527ce94a95df1a98fb6188dea22b8 Mon Sep 17 00:00:00 2001
From: Michal Kubecek
Date: Wed, 30 Jun 2021 11:05:16 +0200
Subject: [PATCH] vmmon: fix task_struct::state access patterns

Mainline commit 2f064a59a11f ("sched: Change task_struct::state") in
5.14-rc1 finishes a series fixing racy access patterns to task state and
renames task_struct::state to __state, so that old code accessing it
directly fails to build.

Two of these in HostIF_SemaphoreWait() can be rewritten into calls to
set_current_state() unconditionally (the second one may do with
__set_current_state(), but I don't feel confident enough about that).

There are also two places where vmmon code reads task_struct::state;
provide a compat accessor using READ_ONCE() and use it instead of a
direct read.

To avoid a kernel version check, test for the presence of the
get_current_state() macro, which was introduced in the same commit as
the state member rename.
---
 vmmon-only/include/compat_sched.h | 15 +++++++++++++++
 vmmon-only/linux/hostif.c         | 10 ++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/vmmon-only/include/compat_sched.h b/vmmon-only/include/compat_sched.h
index 3f3304b..72078e0 100644
--- a/vmmon-only/include/compat_sched.h
+++ b/vmmon-only/include/compat_sched.h
@@ -289,5 +289,20 @@ typedef struct pid * compat_pid;
 #define compat_kill_pid(pid, sig, flag) kill_pid(pid, sig, flag)
 #endif
 
+/*
+ * Since v5.14-rc1, task_struct::state has been renamed to __state and is
+ * no longer supposed to be accessed without READ_ONCE/WRITE_ONCE.
+ */
+#ifdef get_current_state
+static inline int compat_get_task_state(const struct task_struct *t)
+{
+   return READ_ONCE(t->__state);
+}
+#else
+static inline int compat_get_task_state(const struct task_struct *t)
+{
+   return READ_ONCE(t->state);
+}
+#endif
 
 #endif /* __COMPAT_SCHED_H__ */
diff --git a/vmmon-only/linux/hostif.c b/vmmon-only/linux/hostif.c
index 137062c..6910f69 100644
--- a/vmmon-only/linux/hostif.c
+++ b/vmmon-only/linux/hostif.c
@@ -78,6 +78,8 @@
 #include "pgtbl.h"
 #include "versioned_atomic.h"
 
+#include "compat_sched.h"
+
 #if !defined(CONFIG_HIGH_RES_TIMERS)
 #error CONFIG_HIGH_RES_TIMERS required for acceptable performance
 #endif
@@ -445,7 +447,7 @@ HostIF_WakeUpYielders(VMDriver *vm,  // IN:
    while ((vcpuid = VCPUSet_FindFirst(&req)) != VCPUID_INVALID) {
       struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
       VCPUSet_Remove(&req, vcpuid);
-      if (t && (t->state & TASK_INTERRUPTIBLE)) {
+      if (t && (compat_get_task_state(t) & TASK_INTERRUPTIBLE)) {
         wake_up_process(t);
      }
   }
@@ -2454,14 +2456,14 @@ HostIF_SemaphoreWait(VMDriver *vm,   // IN:
      unsigned int mask;
 
      poll_initwait(&table);
-     current->state = TASK_INTERRUPTIBLE;
+     set_current_state(TASK_INTERRUPTIBLE);
      mask = file->f_op->poll(file, &table.pt);
      if (!(mask & (POLLIN | POLLERR | POLLHUP))) {
        vm->vmhost->vcpuSemaTask[vcpuid] = current;
        schedule_timeout(timeoutms * HZ / 1000);  // convert to Hz
        vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
      }
-     current->state = TASK_RUNNING;
+     set_current_state(TASK_RUNNING);
      poll_freewait(&table);
    }
@@ -2535,7 +2537,7 @@ HostIF_SemaphoreForceWakeup(VMDriver *vm,  // IN:
    FOR_EACH_VCPU_IN_SET(vcs, vcpuid) {
      struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
      vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
-     if (t && (t->state & TASK_INTERRUPTIBLE)) {
+     if (t && (compat_get_task_state(t) & TASK_INTERRUPTIBLE)) {
        wake_up_process(t);
      }
   } ROF_EACH_VCPU_IN_SET();
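For reference, the #ifdef get_current_state feature test above works because that macro and the state -> __state rename landed in the same mainline commit, so the macro's presence identifies the new struct layout without any LINUX_VERSION_CODE comparison. A standalone sketch of the accessor and a typical caller (illustrative only; wake_if_interruptible is a made-up name):

#include <linux/sched.h>

/*
 * v5.14 renamed task_struct::state to __state and introduced the
 * get_current_state() macro in the same commit, so testing for the
 * macro selects the right member name.
 */
static inline int compat_get_task_state(const struct task_struct *t)
{
#ifdef get_current_state
   return READ_ONCE(t->__state);   /* v5.14 and later */
#else
   return READ_ONCE(t->state);     /* older kernels */
#endif
}

/* Typical caller, mirroring HostIF_WakeUpYielders(): wake a task only
 * if it is sleeping interruptibly. */
static void wake_if_interruptible(struct task_struct *t)
{
   if (t && (compat_get_task_state(t) & TASK_INTERRUPTIBLE)) {
      wake_up_process(t);
   }
}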