Hello,
I know that among ourselves we are all free-software devotees ... but we have to interact with playmates who, for their part, seem to have no problem using proprietary %$# !
This little post follows up on the previous one about installing VMplayer after the upgrade to FC26!
I struggled quite a bit before managing to install this proprietary thing, the main trouble coming from compilation problems, so as usual: if the hours I spent finding the solution can save some for others, I am sharing ...
The procedure remains very similar to the previous one, except that a few files have to be modified before compiling.
So, after running:
# tar -xvf /usr/lib/vmware/modules/source/vmmon.tar --directory /usr/lib/vmware/modules/source
# tar -xvf /usr/lib/vmware/modules/source/vmnet.tar --directory /usr/lib/vmware/modules/source
Before compiling, there are a few modifications to make in certain files:
- add the file /usr/lib/vmware/modules/source/vmmon-only/include/compat_timer.h, with the following content:
#ifndef __COMPAT_TIMER_H__
# define __COMPAT_TIMER_H__
#include <linux/timer.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) && !defined(timer_setup)
typedef unsigned long compat_timer_arg_t;
static inline void timer_setup(struct timer_list *timer,
void (*func)(compat_timer_arg_t),
unsigned int flags)
{
init_timer(timer);
timer->function = func;
timer->data = 0;
timer->flags = flags;
}
#else /* new timer interface since 4.15 */
typedef struct timer_list *compat_timer_arg_t;
#endif /* new timer interface since 4.15 */
#endif /* __COMPAT_TIMER_H__ */
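As a side note, here is a minimal sketch (not part of the files to modify; MyTimeout, myTimer and MyArmTimer are made-up names) showing how a callback written against this shim builds on both sides of the 4.15 timer API change:

#include <linux/jiffies.h>
#include "compat_timer.h"

static struct timer_list myTimer;

/* Receives an 'unsigned long' before 4.15 and a 'struct timer_list *' from 4.15 on. */
static void MyTimeout(compat_timer_arg_t unused)
{
   /* handle the expiry here */
}

static void MyArmTimer(void)
{
   timer_setup(&myTimer, MyTimeout, 0);  /* shim on < 4.15, native API on >= 4.15 */
   myTimer.expires = jiffies + HZ;       /* fire in about one second */
   add_timer(&myTimer);
}

This is exactly the pattern driver.c below uses for tscTimer and LinuxDriverEstimateTSCkHzDeferred.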
Modify the file /usr/lib/vmware/modules/source/vmmon-only/linux/driver.c so that it reads as follows:
/*********************************************************
* Copyright (C) 1998-2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/* Must come before any kernel header file */
#include "driver-config.h"
#define EXPORT_SYMTAB
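/* compat_timer.h is the shim file created above; on pre-4.15 kernels it supplies timer_setup() and compat_timer_arg_t. */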
#include "compat_timer.h"
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/poll.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/wait.h>
#include <asm/hw_irq.h> /* for CALL_FUNCTION_VECTOR */
#include "compat_version.h"
#include "compat_module.h"
#include "compat_page.h"
#include "usercalldefs.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 16)
#error Linux before 2.6.16 is not supported
#endif
#include <asm/io.h>
#include "vmware.h"
#include "driverLog.h"
#include "driver.h"
#include "modulecall.h"
#include "vm_asm.h"
#include "vmx86.h"
#include "initblock.h"
#include "task.h"
#include "memtrack.h"
#include "task.h"
#include "cpuid.h"
#include "cpuid_info.h"
#include "circList.h"
#include "x86msr.h"
#ifdef VMX86_DEVEL
#include "private.h"
#endif
#include "hostif.h"
#include "hostif_priv.h"
#include "vmhost.h"
#include "vmmonInt.h"
static void LinuxDriverQueue(VMLinux *vmLinux);
static void LinuxDriverDequeue(VMLinux *vmLinux);
static Bool LinuxDriverCheckPadding(void);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
#define VMW_NOPAGE_2624
#endif
#define VMMON_UNKNOWN_SWAP_SIZE -1ULL
struct VMXLinuxState linuxState;
/*
*----------------------------------------------------------------------
*
* Device Driver Interface --
*
* Runs the VM by implementing open/close/ioctl functions
*
*
*----------------------------------------------------------------------
*/
static int LinuxDriver_Open(struct inode *inode, struct file *filp);
/*
* gcc-4.5+ can name-mangle LinuxDriver_Ioctl, but our stack-size
* script needs to find it. So it shouldn't be static. ("hidden"
* visibility would be OK.)
*/
long LinuxDriver_Ioctl(struct file *filp, u_int iocmd,
unsigned long ioarg);
static int LinuxDriver_Close(struct inode *inode, struct file *filp);
static unsigned int LinuxDriverPoll(struct file *file, poll_table *wait);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
static int LinuxDriverFault(struct vm_fault *fault);
#elif defined(VMW_NOPAGE_2624)
static int LinuxDriverFault(struct vm_area_struct *vma, struct vm_fault *fault);
#else
static struct page *LinuxDriverNoPage(struct vm_area_struct *vma,
unsigned long address,
int *type);
#endif
static int LinuxDriverMmap(struct file *filp, struct vm_area_struct *vma);
static void LinuxDriverPollTimeout(compat_timer_arg_t clientData);
static unsigned int LinuxDriverEstimateTSCkHz(void);
static struct vm_operations_struct vmuser_mops = {
#ifdef VMW_NOPAGE_2624
.fault = LinuxDriverFault
#else
.nopage = LinuxDriverNoPage
#endif
};
static struct file_operations vmuser_fops;
static struct timer_list tscTimer;
static Atomic_uint32 tsckHz;
static VmTimeStart tsckHzStartTime;
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHzWork --
*
* Estimates TSC frequency in terms of cycles and system uptime
* elapsed since module init. At module init, the starting cycle
* count and uptime are recorded (in tsckHzStartTime) and a timer
* is scheduled to call this function after 4 seconds.
*
* It is possible that vmx queries the TSC rate after module init
* but before the 4s timer expires. In that case, we just go ahead
* and compute the rate for the duration since the driver loaded.
* When the timer expires, the new computed value is dropped. If the
* query races with the timer, the first thread to write to 'tsckHz'
* wins.
*
*----------------------------------------------------------------------
*/
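/*
 * Worked example with made-up numbers: 9,600,000,000 elapsed cycles over
 * 4,000,000 microseconds of uptime (uptime ticks at UPTIME_FREQ = 1 MHz,
 * see hostif.c) is 2400 cycles/us, i.e. a 2.4 GHz TSC, which
 * Vmx86_ComputekHz would report as 2400000 kHz.
 */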
static void
LinuxDriverEstimateTSCkHzWork(void *data)
{
VmTimeStart curTime;
uint64 cycles;
uint64 uptime;
unsigned int khz;
ASSERT(tsckHzStartTime.count != 0 && tsckHzStartTime.time != 0);
Vmx86_ReadTSCAndUptime(&curTime);
cycles = curTime.count - tsckHzStartTime.count;
uptime = curTime.time - tsckHzStartTime.time;
khz = Vmx86_ComputekHz(cycles, uptime);
if (khz != 0) {
if (Atomic_ReadIfEqualWrite(&tsckHz, 0, khz) == 0) {
Log("TSC frequency estimated using system uptime: %u\n", khz);
}
} else if (Atomic_ReadIfEqualWrite(&tsckHz, 0, cpu_khz) == 0) {
Log("Failed to compute TSC frequency, using cpu_khz: %u\n", cpu_khz);
}
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHz --
*
* Returns the estimated TSC khz, cached in tsckHz. If tsckHz is
* 0, the routine kicks off estimation work on CPU 0.
*
* Results:
*
* Returns the estimated TSC khz value.
*
*----------------------------------------------------------------------
*/
static unsigned int
LinuxDriverEstimateTSCkHz(void)
{
int err;
uint32 khz;
khz = Atomic_Read(&tsckHz);
if (khz != 0) {
return khz;
}
err = compat_smp_call_function_single(0, LinuxDriverEstimateTSCkHzWork,
NULL, 1);
/*
* The smp function call may fail for two reasons, either
* the function is not supported by the kernel, or the cpu
* went offline. In this unlikely event, we just perform
* the work wherever we can.
*/
if (err != 0) {
LinuxDriverEstimateTSCkHzWork(NULL);
}
return Atomic_Read(&tsckHz);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHzDeferred --
*
* Timer callback for deferred TSC rate estimation.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverEstimateTSCkHzDeferred(compat_timer_arg_t unused)
{
LinuxDriverEstimateTSCkHz();
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverInitTSCkHz --
*
* Initialize TSC khz rate.
*
* We rely on the kernel estimated cycle rate in the exported
* variable tsc_khz. If the kernel has disabled tsc, tsc_khz
* will be 0, and we fall back on our own estimation routines.
*
* Side effects:
*
* If tsc_khz is unusable, schedules a 4s timer for deferred
* khz estimation (see LinuxDriverEstimateTSCkHz).
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverInitTSCkHz(void)
{
unsigned int khz;
khz = compat_tsc_khz();
if (khz != 0) {
Atomic_Write(&tsckHz, khz);
Log("Using tsc_khz as TSC frequency: %u\n", khz);
return;
}
Vmx86_ReadTSCAndUptime(&tsckHzStartTime);
tscTimer.expires = jiffies + 4 * HZ;
add_timer(&tscTimer);
}
/*
*----------------------------------------------------------------------
*
* init_module --
*
* linux module entry point. Called by /sbin/insmod command
*
* Results:
* registers a device driver for a major # that depends
* on the uid. Add yourself to that list. List is now in
* private/driver-private.c.
*
*----------------------------------------------------------------------
*/
int
init_module(void)
{
int retval;
DriverLog_Init("/dev/vmmon");
HostIF_InitGlobalLock();
if (!LinuxDriverCheckPadding()) {
return -ENOEXEC;
}
CPUID_Init();
if (!Task_Initialize()) {
return -ENOEXEC;
}
/*
* Initialize LinuxDriverPoll state
*/
init_waitqueue_head(&linuxState.pollQueue);
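/* Patched: timer_setup() replaces the old init_timer()/.function/.data setup. */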
timer_setup(&linuxState.pollTimer, LinuxDriverPollTimeout, 0);
linuxState.fastClockThread = NULL;
linuxState.fastClockFile = NULL;
linuxState.fastClockRate = 0;
linuxState.fastClockPriority = -20;
linuxState.swapSize = VMMON_UNKNOWN_SWAP_SIZE;
/*
* Initialize the file_operations structure. Because this code is always
* compiled as a module, it is fine to do this here and not in a static
* initializer.
*/
memset(&vmuser_fops, 0, sizeof vmuser_fops);
vmuser_fops.owner = THIS_MODULE;
vmuser_fops.poll = LinuxDriverPoll;
vmuser_fops.unlocked_ioctl = LinuxDriver_Ioctl;
vmuser_fops.compat_ioctl = LinuxDriver_Ioctl;
vmuser_fops.open = LinuxDriver_Open;
vmuser_fops.release = LinuxDriver_Close;
vmuser_fops.mmap = LinuxDriverMmap;
#ifdef VMX86_DEVEL
devel_init_module();
linuxState.minor = 0;
retval = register_chrdev(linuxState.major, linuxState.deviceName,
&vmuser_fops);
#else
sprintf(linuxState.deviceName, "vmmon");
linuxState.major = 10;
linuxState.minor = 165;
linuxState.misc.minor = linuxState.minor;
linuxState.misc.name = linuxState.deviceName;
linuxState.misc.fops = &vmuser_fops;
retval = misc_register(&linuxState.misc);
#endif
if (retval) {
Warning("Module %s: error registering with major=%d minor=%d\n",
linuxState.deviceName, linuxState.major, linuxState.minor);
return -ENOENT;
}
Log("Module %s: registered with major=%d minor=%d\n",
linuxState.deviceName, linuxState.major, linuxState.minor);
HostIF_InitUptime();
timer_setup(&tscTimer, LinuxDriverEstimateTSCkHzDeferred, 0);
LinuxDriverInitTSCkHz();
Vmx86_InitIDList();
Log("Module %s: initialized\n", linuxState.deviceName);
return 0;
}
/*
*----------------------------------------------------------------------
*
* cleanup_module --
*
* Called by /sbin/rmmod
*
*
*----------------------------------------------------------------------
*/
void
cleanup_module(void)
{
/*
* XXX smp race?
*/
#ifdef VMX86_DEVEL
unregister_chrdev(linuxState.major, linuxState.deviceName);
#else
misc_deregister(&linuxState.misc);
#endif
Log("Module %s: unloaded\n", linuxState.deviceName);
del_timer_sync(&linuxState.pollTimer);
del_timer_sync(&tscTimer);
Task_Terminate();
// Make sure fastClockThread is dead
HostIF_FastClockLock(1);
HostIF_SetFastClockRate(0);
HostIF_FastClockUnlock(1);
HostIF_CleanupUptime();
}
/*
*----------------------------------------------------------------------
*
* LinuxDriver_Open --
*
* called on open of /dev/vmmon or /dev/vmx86.$USER. Use count used
* to determine eventual deallocation of the module
*
* Side effects:
* Increment use count used to determine eventual deallocation of
* the module
*
*----------------------------------------------------------------------
*/
static int
LinuxDriver_Open(struct inode *inode, // IN
struct file *filp) // IN
{
VMLinux *vmLinux;
vmLinux = kmalloc(sizeof *vmLinux, GFP_KERNEL);
if (vmLinux == NULL) {
return -ENOMEM;
}
memset(vmLinux, 0, sizeof *vmLinux);
sema_init(&vmLinux->lock4Gb, 1);
init_waitqueue_head(&vmLinux->pollQueue);
filp->private_data = vmLinux;
LinuxDriverQueue(vmLinux);
Vmx86_Open();
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverAllocPages --
*
* Allocate physically contiguous block of memory with specified order.
* Pages in the allocated block are configured so that caller can pass
* independent pages to the VM.
*
* Results:
* Zero on success, non-zero (error code) on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static int
LinuxDriverAllocPages(unsigned int gfpFlag, // IN
unsigned int order, // IN
struct page **pg, // OUT
unsigned int size) // IN
{
struct page* page;
page = alloc_pages(gfpFlag, order);
if (page) {
unsigned int i;
/*
* Grab an extra reference on all pages except first one - first
* one was already refcounted by alloc_pages.
*
* Under normal situation all pages except first one in the block
* have refcount zero. As we pass these pages to the VM, we must
* bump their count, otherwise the VM will release these pages every
* time they are unmapped from the user's process, causing a crash.
*
* Note that this depends on Linux VM internals. It works on all
* kernels we care about.
*/
order = 1 << order;
for (i = 0; i < order; i++) {
if (i) {
/*
* Debug kernels assert that page->_count is not zero when
* calling get_page. We use init_page_count as a temporary
* workaround. PR 894174
*/
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16)
ASSERT(page_count(page) == 0);
init_page_count(page);
#else
get_page(page);
#endif
}
if (i >= size) {
put_page(page);
} else {
void *addr = kmap(page);
memset(addr, 0, PAGE_SIZE);
kunmap(page);
*pg++ = page;
}
page++;
}
return 0;
}
return -ENOMEM;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverDestructor4Gb --
*
* Deallocate all directly mappable memory.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static void
LinuxDriverDestructor4Gb(VMLinux *vmLinux) // IN
{
unsigned int pg;
if (!vmLinux->size4Gb) {
return;
}
for (pg = 0; pg < vmLinux->size4Gb; pg++) {
put_page(vmLinux->pages4Gb[pg]);
}
vmLinux->size4Gb = 0;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriver_Close --
*
* called on close of /dev/vmmon or /dev/vmx86.$USER, most often when the
* process exits. Decrement use count, allowing for possible uninstalling
* of the module.
*
*----------------------------------------------------------------------
*/
static int
LinuxDriver_Close(struct inode *inode, // IN
struct file *filp) // IN
{
VMLinux *vmLinux;
vmLinux = (VMLinux *)filp->private_data;
ASSERT(vmLinux);
LinuxDriverDequeue(vmLinux);
if (vmLinux->vm != NULL) {
Vmx86_ReleaseVM(vmLinux->vm);
vmLinux->vm = NULL;
}
Vmx86_Close();
/*
* Destroy all low memory allocations.
* We are closing the struct file here, so clearly no other process
* uses it anymore, and we do not need to hold the semaphore.
*/
LinuxDriverDestructor4Gb(vmLinux);
/*
* Clean up poll state.
*/
HostIF_PollListLock(0);
if (vmLinux->pollBack != NULL) {
if ((*vmLinux->pollBack = vmLinux->pollForw) != NULL) {
vmLinux->pollForw->pollBack = vmLinux->pollBack;
}
}
HostIF_PollListUnlock(0);
// XXX call wake_up()?
HostIF_UnmapUserMem(vmLinux->pollTimeoutHandle);
kfree(vmLinux);
filp->private_data = NULL;
return 0;
}
#define POLLQUEUE_MAX_TASK 1000
static DEFINE_SPINLOCK(pollQueueLock);
static void *pollQueue[POLLQUEUE_MAX_TASK];
static unsigned int pollQueueCount = 0;
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverQueuePoll --
*
* Remember that current process waits for next timer event.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
LinuxDriverQueuePoll(void)
{
unsigned long flags;
spin_lock_irqsave(&pollQueueLock, flags);
/*
* Under normal circumstances every process should be listed
* only once in this array. If it becomes a problem that a process
* can be in the array twice, walk the array! Maybe you can keep
* it sorted by 'current' value then, making IsPollQueued
* a bit faster...
*/
if (pollQueueCount < POLLQUEUE_MAX_TASK) {
pollQueue[pollQueueCount++] = current;
}
spin_unlock_irqrestore(&pollQueueLock, flags);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverIsPollQueued --
*
* Determine whether timer event occurred since we queued for it using
* LinuxDriverQueuePoll.
*
* Results:
* 0 Event already occurred.
* 1 Event did not occur yet.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER int
LinuxDriverIsPollQueued(void)
{
unsigned long flags;
unsigned int i;
int retval = 0;
spin_lock_irqsave(&pollQueueLock, flags);
for (i = 0; i < pollQueueCount; i++) {
if (current == pollQueue[i]) {
retval = 1;
break;
}
}
spin_unlock_irqrestore(&pollQueueLock, flags);
return retval;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverFlushPollQueue --
*
* Signal to queue that timer event occurred.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
LinuxDriverFlushPollQueue(void)
{
unsigned long flags;
spin_lock_irqsave(&pollQueueLock, flags);
pollQueueCount = 0;
spin_unlock_irqrestore(&pollQueueLock, flags);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverWakeUp --
*
* Wake up processes waiting on timer event.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void
LinuxDriverWakeUp(Bool selective) // IN:
{
if (selective && linuxState.pollList != NULL) {
struct timeval tv;
VmTimeType now;
VMLinux *p;
VMLinux *next;
HostIF_PollListLock(1);
do_gettimeofday(&tv);
now = tv.tv_sec * 1000000ULL + tv.tv_usec;
for (p = linuxState.pollList; p != NULL; p = next) {
next = p->pollForw;
if (p->pollTime <= now) {
if ((*p->pollBack = next) != NULL) {
next->pollBack = p->pollBack;
}
p->pollForw = NULL;
p->pollBack = NULL;
wake_up(&p->pollQueue);
}
}
HostIF_PollListUnlock(1);
}
LinuxDriverFlushPollQueue();
wake_up(&linuxState.pollQueue);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverPoll --
*
* This is used to wake up the VMX when a user call arrives, or
* to wake up select() or poll() at the next clock tick.
*
*----------------------------------------------------------------------
*/
static unsigned int
LinuxDriverPoll(struct file *filp, // IN:
poll_table *wait) // IN:
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
unsigned int mask = 0;
/*
* Set up or check the timeout for fast wakeup.
*
* Thanks to Petr for this simple and correct implementation:
*
* There are four cases of wait == NULL:
* another file descriptor is ready in the same poll()
* just slept and woke up
* nonblocking poll()
* did not sleep due to memory allocation on 2.4.21-9.EL
* In first three cases, it's okay to return POLLIN.
* Unfortunately, for 4th variant we have to do some
* bookkeeping to not return POLLIN when timer did not expire
* yet.
*
* We may schedule a timer unnecessarily if an existing
* timer fires between poll_wait() and timer_pending().
*
* -- edward
*/
if (wait == NULL) {
if (vmLinux->pollBack == NULL && !LinuxDriverIsPollQueued()) {
mask = POLLIN;
}
} else {
if (linuxState.fastClockThread && vmLinux->pollTimeoutPtr != NULL) {
struct timeval tv;
do_gettimeofday(&tv);
poll_wait(filp, &vmLinux->pollQueue, wait);
vmLinux->pollTime = *vmLinux->pollTimeoutPtr +
tv.tv_sec * 1000000ULL + tv.tv_usec;
if (vmLinux->pollBack == NULL) {
HostIF_PollListLock(2);
if (vmLinux->pollBack == NULL) {
if ((vmLinux->pollForw = linuxState.pollList) != NULL) {
vmLinux->pollForw->pollBack = &vmLinux->pollForw;
}
linuxState.pollList = vmLinux;
vmLinux->pollBack = &linuxState.pollList;
}
HostIF_PollListUnlock(2);
}
} else {
LinuxDriverQueuePoll();
poll_wait(filp, &linuxState.pollQueue, wait);
if (!timer_pending(&linuxState.pollTimer)) {
mod_timer(&linuxState.pollTimer, jiffies + 1);
}
}
}
return mask;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverPollTimeout --
*
* Wake up a process waiting in poll/select. This is called from
* the timer, and hence processed in the bottom half
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverPollTimeout(compat_timer_arg_t clientData) // IN:
{
LinuxDriverWakeUp(FALSE);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverNoPage/LinuxDriverFault --
*
* Callback for returning allocated page for memory mapping
*
* Results:
* NoPage:
* Page or page address on success, NULL or 0 on failure.
* Fault:
* Error code; 0, minor page fault.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
static int
LinuxDriverFault(struct vm_fault *fault) //IN/OUT
#elif defined(VMW_NOPAGE_2624)
static int LinuxDriverFault(struct vm_area_struct *vma, //IN
struct vm_fault *fault) //IN/OUT
#else
static struct page *LinuxDriverNoPage(struct vm_area_struct *vma, //IN
unsigned long address, //IN
int *type) //OUT: Fault type
#endif
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
struct vm_area_struct *vma = fault->vma;
#endif
VMLinux *vmLinux = (VMLinux *) vma->vm_file->private_data;
unsigned long pg;
struct page* page;
#ifdef VMW_NOPAGE_2624
pg = fault->pgoff;
#else
pg = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
#endif
pg = VMMON_MAP_OFFSET(pg);
if (pg >= vmLinux->size4Gb) {
#ifdef VMW_NOPAGE_2624
return VM_FAULT_SIGBUS;
#else
return 0;
#endif
}
page = vmLinux->pages4Gb[pg];
get_page(page);
#ifdef VMW_NOPAGE_2624
fault->page = page;
return 0;
#else
*type = VM_FAULT_MINOR;
return page;
#endif
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverAllocContig --
*
* Create mapping for contiguous memory areas.
*
* Results:
*
* 0 on success,
* -EINVAL on invalid arguments or
* -ENOMEM on out of memory
*
* Side effects:
* Pages for mapping are allocated.
*
*-----------------------------------------------------------------------------
*/
static int LinuxDriverAllocContig(VMLinux *vmLinux,
struct vm_area_struct *vma,
unsigned long off,
unsigned long size)
{
unsigned long vmaOrder = VMMON_MAP_ORDER(off);
unsigned long vmaAllocSize;
unsigned int gfpFlag;
unsigned long i;
if (VMMON_MAP_RSVD(off)) {
/* Reserved bits set... */
return -EINVAL;
}
if (VMMON_MAP_OFFSET(off)) {
/* We do not need non-zero offsets... */
return -EINVAL;
}
switch (VMMON_MAP_MT(off)) {
case VMMON_MAP_MT_LOW4GB:
#ifdef GFP_DMA32
gfpFlag = GFP_USER | GFP_DMA32;
#else
gfpFlag = GFP_USER | GFP_DMA;
#endif
break;
case VMMON_MAP_MT_LOW16MB:
gfpFlag = GFP_USER | GFP_DMA;
break;
case VMMON_MAP_MT_ANY:
gfpFlag = GFP_HIGHUSER;
break;
default:
/* Invalid memory type */
return -EINVAL;
}
if (size > VMMON_MAP_OFFSET_MASK + 1) {
/* Size is too big to fit to our window. */
return -ENOMEM;
}
/* 16 pages looks like a good limit... */
if (size > VMMON_MAX_LOWMEM_PAGES) {
return -ENOMEM;
}
/* Sorry. Only one mmap per one open. */
down(&vmLinux->lock4Gb);
if (vmLinux->size4Gb) {
up(&vmLinux->lock4Gb);
return -EINVAL;
}
vmaAllocSize = 1 << vmaOrder;
for (i = 0; i < size; i += vmaAllocSize) {
int err;
err = LinuxDriverAllocPages(gfpFlag, vmaOrder,
vmLinux->pages4Gb + i, size - i);
if (err) {
while (i > 0) {
put_page(vmLinux->pages4Gb[--i]);
}
up(&vmLinux->lock4Gb);
return err;
}
}
vmLinux->size4Gb = size;
up(&vmLinux->lock4Gb);
vma->vm_ops = &vmuser_mops;
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverMmap --
*
* Create mapping for lowmem or locked memory.
*
* Results:
*
* 0 on success,
* -EINVAL on invalid arguments or
* -ENOMEM on out of memory
*
* Side effects:
* Pages for mapping are allocated.
*
*-----------------------------------------------------------------------------
*/
static int
LinuxDriverMmap(struct file *filp,
struct vm_area_struct *vma)
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
unsigned long size;
int err;
/* Only shared mappings */
if (!(vma->vm_flags & VM_SHARED)) {
return -EINVAL;
}
if ((vma->vm_end | vma->vm_start) & (PAGE_SIZE - 1)) {
return -EINVAL;
}
size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (size < 1) {
return -EINVAL;
}
if (vmLinux->vm) {
err = -EINVAL;
} else {
err = LinuxDriverAllocContig(vmLinux, vma, vma->vm_pgoff, size);
}
if (err) {
return err;
}
/* Clear VM_IO, otherwise SuSE's kernels refuse to do get_user_pages */
vma->vm_flags &= ~VM_IO;
return 0;
}
typedef Bool (*SyncFunc)(void *data, unsigned cpu);
typedef struct {
Atomic_uint32 numCPUs;
Atomic_uint32 ready;
Atomic_uint32 failures;
Atomic_uint32 done;
SyncFunc func;
void *data;
} SyncFuncArgs;
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncCallHook --
*
* Called on each CPU, waits for them all to show up, and executes
* the callback.
*
* Results:
*
* Side effects:
* Whatever side effects the callback has.
*
*-----------------------------------------------------------------------------
*/
static void
LinuxDriverSyncCallHook(void *data) // IN:
{
Bool success;
uint32 numCPUs;
volatile unsigned iterations = 1000 * 1000;
SyncFuncArgs *args = (SyncFuncArgs *)data;
unsigned cpu = smp_processor_id();
/*
* We need to be careful about reading cpu_online_map on kernels that
* have hot add/remove cpu support. The kernel's smp_call_function
* blocks hot add from occurring between the time it computes the set
* of cpus it will IPI and when all those cpus have entered their IPI
* handlers. Additionally, we disabled preemption on the initiating
* cpu during the entire sync call sequence. So, since a cpu hot add
* is initiated from process context, a cpu cannot be hot added until
* at least one cpu has exited this code, and therefore it is safe
* for the first cpu to reach this point to read cpu_online_map.
*
* Hot remove works by stopping the entire machine, which is done by
* waiting for a set of kernel threads to be scheduled on all cpus.
* This cannot happen until all cpus are preemptible. Since the
* initiating cpu has preemption disabled during this entire
* sequence, this code is also safe from cpu hot remove.
*
* So, the first cpu to reach this code will read the same value of
* cpu_online_map that was used by smp_call_function, and therefore
* we can safely assume that numCPUs cpus will execute this routine.
*/
Atomic_CMPXCHG32(&args->numCPUs, 0, num_online_cpus());
numCPUs = Atomic_Read(&args->numCPUs);
Atomic_Inc(&args->ready);
/*
* Wait for all CPUs, but not forever since we could deadlock. The
* potential deadlock scenario is this: cpu0 has IF=1 and holds a
* lock. cpu1 has IF=0 and is spinning waiting for the lock.
*/
while (Atomic_Read(&args->ready) != numCPUs && --iterations) ;
/* Now simultaneously call the routine. */
success = args->func(args->data, cpu);
if (!iterations || !success) {
/* Indicate that we either timed out or the callback failed. */
Atomic_Inc(&args->failures);
}
/* Indicate that we are finished. */
Atomic_Inc(&args->done);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncCallOnEachCPU --
*
* Calls func on each cpu at (nearly) the same time.
*
* Results:
* TRUE if func was called at the same time on all cpus. Note that
* func is called regardless of whether all cpus showed up in time.
*
* Side effects:
* func's side effects, on all cpus.
*
*-----------------------------------------------------------------------------
*/
static Bool
LinuxDriverSyncCallOnEachCPU(SyncFunc func, // IN:
void *data) // IN:
{
SyncFuncArgs args;
uintptr_t flags;
ASSERT(HostIF_GlobalLockIsHeld());
args.func = func;
args.data = data;
Atomic_Write(&args.numCPUs, 0); // Must be calculated inside the callback.
Atomic_Write(&args.ready, 0);
Atomic_Write(&args.failures, 0);
Atomic_Write(&args.done, 0);
preempt_disable();
/*
* Call all other CPUs, but do not wait so we can enter the callback
* on this CPU too.
*/
compat_smp_call_function(LinuxDriverSyncCallHook, &args, 0);
/*
* smp_call_function doesn't return until all cpus have been
* interrupted. It's safe to disable interrupts now that all other
* cpus are in their IPI handlers.
*/
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
LinuxDriverSyncCallHook(&args);
RESTORE_FLAGS(flags);
preempt_enable();
/*
* Wait for everyone else to finish so we can get an accurate
* failures count.
*/
while (Atomic_Read(&args.done) != Atomic_Read(&args.numCPUs)) ;
/*
* This routine failed if any CPU bailed out early to avoid deadlock,
* or the callback routine failed on any CPU. Both conditions are
* recorded in the failures field.
*/
return Atomic_Read(&args.failures) == 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverReadTSC --
*
* Callback that is executed simultaneously on all cpus to read the TSCs.
*
* Results:
* TRUE.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static Bool
LinuxDriverReadTSC(void *data, // OUT: TSC values
unsigned cpu) // IN: the pcpu number
{
TSCDelta *tscDelta = (TSCDelta *)data;
uint64 tsc, old;
if (LIKELY(CPUID_SSE2Supported())) {
RDTSC_BARRIER();
}
tsc = RDTSC();
/* Any looping means another CPU changed min/max. */
do {
old = Atomic_Read64(&tscDelta->min);
} while (old > tsc && !Atomic_CMPXCHG64(&tscDelta->min, &old, &tsc));
do {
old = Atomic_Read64(&tscDelta->max);
} while (old < tsc && !Atomic_CMPXCHG64(&tscDelta->max, &old, &tsc));
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncReadTSCs --
*
* Simultaneously read the TSCs on all cpus.
*
* Results:
* The set of all TSCs.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
__attribute__((always_inline)) static Bool
LinuxDriverSyncReadTSCs(uint64 *delta) // OUT: TSC max - TSC min
{
TSCDelta tscDelta;
unsigned i;
Bool okay = FALSE;
/* Take the global lock to block concurrent calls. */
HostIF_GlobalLock(14);
/* Loop to warm up the cache. */
for (i = 0; i < 3; i++) {
Atomic_Write64(&tscDelta.min, ~CONST64U(0));
Atomic_Write64(&tscDelta.max, CONST64U(0));
if (LinuxDriverSyncCallOnEachCPU(LinuxDriverReadTSC, &tscDelta)) {
/* We return the last successful simultaneous read of the TSCs. */
*delta = Atomic_Read64(&tscDelta.max) - Atomic_Read64(&tscDelta.min);
okay = TRUE;
}
}
HostIF_GlobalUnlock(14);
return okay;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriver_Ioctl --
*
* Main path for UserRPC
*
* Be VERY careful with stack usage; gcc's stack allocation is iffy
* and allocations from individual "case" statements do not overlap,
* so it is easy to use kilobytes of stack space here.
*
* Results:
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
long
LinuxDriver_Ioctl(struct file *filp, // IN:
u_int iocmd, // IN:
unsigned long ioarg) // IN:
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
int retval = 0;
Vcpuid vcpuid;
VMDriver *vm;
if (vmLinux == NULL) {
return -EINVAL;
}
vm = vmLinux->vm;
/*
* Validate the VM pointer for those IOCTLs that require it.
*/
switch (iocmd) {
case IOCTL_VMX86_VERSION:
case IOCTL_VMX86_CREATE_VM:
case IOCTL_VMX86_INIT_CROSSGDT:
case IOCTL_VMX86_SET_UID:
case IOCTL_VMX86_GET_NUM_VMS:
case IOCTL_VMX86_GET_TOTAL_MEM_USAGE:
case IOCTL_VMX86_SET_HARD_LIMIT:
case IOCTL_VMX86_PAE_ENABLED:
case IOCTL_VMX86_VMX_ENABLED:
case IOCTL_VMX86_GET_IPI_VECTORS:
case IOCTL_VMX86_GET_KHZ_ESTIMATE:
case IOCTL_VMX86_GET_ALL_CPUID:
case IOCTL_VMX86_GET_ALL_MSRS:
case IOCTL_VMX86_SET_POLL_TIMEOUT_PTR:
case IOCTL_VMX86_GET_KERNEL_CLOCK_RATE:
case IOCTL_VMX86_GET_REFERENCE_CLOCK_HZ:
case IOCTL_VMX86_INIT_PSEUDO_TSC:
case IOCTL_VMX86_CHECK_PSEUDO_TSC:
case IOCTL_VMX86_GET_PSEUDO_TSC:
case IOCTL_VMX86_SET_HOST_CLOCK_PRIORITY:
case IOCTL_VMX86_SYNC_GET_TSCS:
case IOCTL_VMX86_GET_UNAVAIL_PERF_CTRS:
break;
default:
if (vm == NULL) {
retval = -EINVAL;
goto exit;
}
}
/*
* Perform the IOCTL operation.
*/
switch (iocmd) {
case IOCTL_VMX86_VERSION:
retval = VMMON_VERSION;
break;
case IOCTL_VMX86_CREATE_VM:
if (vm != NULL) {
retval = -EINVAL;
break;
}
vm = Vmx86_CreateVM();
if (vm == NULL) {
retval = -ENOMEM;
} else {
vmLinux->vm = vm;
retval = vm->userID;
}
break;
case IOCTL_VMX86_RELEASE_VM:
vmLinux->vm = NULL;
Vmx86_ReleaseVM(vm);
break;
case IOCTL_VMX86_ALLOC_CROSSGDT: {
InitBlock initBlock;
if (Task_AllocCrossGDT(&initBlock)) {
retval = HostIF_CopyToUser((char *)ioarg, &initBlock,
sizeof initBlock);
} else {
retval = -EINVAL;
}
break;
}
case IOCTL_VMX86_INIT_VM: {
InitBlock initParams;
retval = HostIF_CopyFromUser(&initParams, (char *)ioarg,
sizeof initParams);
if (retval != 0) {
break;
}
if (Vmx86_InitVM(vm, &initParams)) {
retval = -EINVAL;
break;
}
retval = HostIF_CopyToUser((char *)ioarg, &initParams,
sizeof initParams);
break;
}
case IOCTL_VMX86_INIT_CROSSGDT: {
InitCrossGDT initCrossGDT;
retval = HostIF_CopyFromUser(&initCrossGDT, (char *)ioarg,
sizeof initCrossGDT);
if ((retval == 0) && Task_InitCrossGDT(&initCrossGDT)) {
retval = -EIO;
}
break;
}
case IOCTL_VMX86_RUN_VM:
vcpuid = ioarg;
if (vcpuid >= vm->numVCPUs) {
retval = -EINVAL;
break;
}
retval = Vmx86_RunVM(vm, vcpuid);
break;
case IOCTL_VMX86_SET_UID:
#ifdef VMX86_DEVEL
devel_suid();
#else
retval = -EPERM;
#endif
break;
case IOCTL_VMX86_LOCK_PAGE: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LockPage(vm, args.uAddr, FALSE, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_LOCK_PAGE_NEW: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LockPage(vm, args.uAddr, TRUE, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_UNLOCK_PAGE: {
VA64 uAddr;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
retval = Vmx86_UnlockPage(vm, uAddr);
break;
}
case IOCTL_VMX86_UNLOCK_PAGE_BY_MPN: {
VMMUnlockPageByMPN args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
retval = Vmx86_UnlockPageByMPN(vm, args.mpn, args.uAddr);
break;
}
case IOCTL_VMX86_LOOK_UP_MPN: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LookupUserMPN(vm, args.uAddr, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_GET_NUM_VMS:
retval = Vmx86_GetNumVMs();
break;
case IOCTL_VMX86_GET_TOTAL_MEM_USAGE:
retval = Vmx86_GetTotalMemUsage();
break;
case IOCTL_VMX86_SET_HARD_LIMIT: {
int32 limit;
retval = HostIF_CopyFromUser(&limit, (void *)ioarg, sizeof limit);
if (retval != 0) {
break;
}
if (!Vmx86_SetConfiguredLockedPagesLimit(limit)) {
retval = -EINVAL;
}
break;
}
case IOCTL_VMX86_ADMIT: {
VMMemInfoArgs args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval != 0) {
break;
}
Vmx86_Admit(vm, &args);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_READMIT: {
OvhdMem_Deltas delta;
retval = HostIF_CopyFromUser(&delta, (void *)ioarg, sizeof delta);
if (retval != 0) {
break;
}
if (!Vmx86_Readmit(vm, &delta)) {
retval = -1;
}
break;
}
case IOCTL_VMX86_UPDATE_MEM_INFO: {
VMMemMgmtInfoPatch patch;
retval = HostIF_CopyFromUser(&patch, (void *)ioarg, sizeof patch);
if (retval == 0) {
Vmx86_UpdateMemInfo(vm, &patch);
}
break;
}
case IOCTL_VMX86_GET_MEM_INFO: {
VA64 uAddr;
VMMemInfoArgs *userVA;
VMMemInfoArgs in;
VMMemInfoArgs *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
if (in.numVMs < 1 || in.numVMs > MAX_VMS) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(VM_GET_MEM_INFO_SIZE(in.numVMs), TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!Vmx86_GetMemInfo(vm, FALSE, out, VM_GET_MEM_INFO_SIZE(in.numVMs))) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser(userVA, out,
VM_GET_MEM_INFO_SIZE(out->numVMs));
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_PAE_ENABLED:
retval = Vmx86_PAEEnabled();
break;
case IOCTL_VMX86_VMX_ENABLED:
retval = Vmx86_VMXEnabled();
break;
case IOCTL_VMX86_APIC_INIT: {
VMAPICInfo info;
Bool setVMPtr;
Bool probe;
retval = HostIF_CopyFromUser(&info, (VMAPICInfo *)ioarg, sizeof info);
if (retval != 0) {
break;
}
setVMPtr = ((info.flags & APIC_FLAG_DISABLE_NMI) != 0);
probe = ((info.flags & APIC_FLAG_PROBE) != 0);
/*
* Kernel uses NMIs for deadlock detection - set APIC VMptr so that
* NMIs get disabled in the monitor.
*/
setVMPtr = TRUE;
retval = HostIF_APICInit(vm, setVMPtr, probe) ? 0 : -ENODEV;
break;
}
case IOCTL_VMX86_SET_HOST_CLOCK_RATE:
retval = -Vmx86_SetHostClockRate(vm, (unsigned)ioarg);
break;
case IOCTL_VMX86_SEND_IPI: {
VCPUSet ipiTargets;
retval = HostIF_CopyFromUser(&ipiTargets, (VCPUSet *) ioarg,
sizeof ipiTargets);
if (retval == 0) {
HostIF_IPI(vm, &ipiTargets);
}
break;
}
case IOCTL_VMX86_GET_IPI_VECTORS: {
IPIVectors ipiVectors;
ipiVectors.hostIPIVectors[0] = CALL_FUNCTION_VECTOR;
#ifdef CALL_FUNCTION_SINGLE_VECTOR
ipiVectors.hostIPIVectors[1] = CALL_FUNCTION_SINGLE_VECTOR;
#else
ipiVectors.hostIPIVectors[1] = 0;
#endif
ipiVectors.monitorIPIVector = monitorIPIVector;
ipiVectors.hvIPIVector = hvIPIVector;
retval = HostIF_CopyToUser((void *)ioarg, &ipiVectors,
sizeof ipiVectors);
break;
}
case IOCTL_VMX86_GET_KHZ_ESTIMATE:
retval = LinuxDriverEstimateTSCkHz();
break;
case IOCTL_VMX86_GET_ALL_CPUID: {
VA64 uAddr;
CPUIDQuery *userVA;
CPUIDQuery in;
CPUIDQuery *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
/*
* Some kernels panic on kmalloc request larger than 128KB.
* XXX This test should go inside HostIF_AllocKernelMem() then.
*/
if (in.numLogicalCPUs >
(131072 - sizeof *out) / sizeof out->logicalCPUs[0]) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(
sizeof *out + in.numLogicalCPUs * sizeof out->logicalCPUs[0],
TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!HostIF_GetAllCpuInfo(out)) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser((int8 *)userVA + sizeof *userVA,
&out->logicalCPUs[0],
out->numLogicalCPUs * sizeof out->logicalCPUs[0]);
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_GET_ALL_MSRS: {
VA64 uAddr;
MSRQuery *userVA;
MSRQuery in;
MSRQuery *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
/*
* Some kernels panic on kmalloc request larger than 128KB.
* XXX This test should go inside HostIF_AllocKernelMem() then.
*/
if (in.numLogicalCPUs >
(131072 - sizeof *out) / sizeof out->logicalCPUs[0]) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(
sizeof *out + in.numLogicalCPUs * sizeof out->logicalCPUs[0],
TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!Vmx86_GetAllMSRs(out)) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser((int8 *)userVA + sizeof *userVA,
&out->logicalCPUs[0],
out->numLogicalCPUs * sizeof out->logicalCPUs[0]);
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_ALLOC_LOCKED_PAGES:
case IOCTL_VMX86_FREE_LOCKED_PAGES: {
VMMPNList req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
if (iocmd == IOCTL_VMX86_ALLOC_LOCKED_PAGES) {
retval = Vmx86_AllocLockedPages(vm, req.mpnList,
req.mpnCount, FALSE,
req.ignoreLimits);
} else {
retval = Vmx86_FreeLockedPages(vm, req.mpnList,
req.mpnCount, FALSE);
}
break;
}
case IOCTL_VMX86_GET_NEXT_ANON_PAGE: {
VMMPNNext req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
req.outMPN = INVALID_MPN;
} else {
req.outMPN = Vmx86_GetNextAnonPage(vm, req.inMPN);
}
retval = HostIF_CopyToUser((void *)ioarg, &req, sizeof req);
break;
}
case IOCTL_VMX86_GET_LOCKED_PAGES_LIST: {
VMMPNList req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = Vmx86_GetLockedPageList(vm, req.mpnList, req.mpnCount);
break;
}
case IOCTL_VMX86_READ_PAGE: {
VMMReadWritePage req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = HostIF_ReadPage(vm, req.mpn, req.uAddr, FALSE);
break;
}
case IOCTL_VMX86_WRITE_PAGE: {
VMMReadWritePage req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = HostIF_WritePage(vm, req.mpn, req.uAddr, FALSE);
break;
}
case IOCTL_VMX86_SET_POLL_TIMEOUT_PTR: {
vmLinux->pollTimeoutPtr = NULL;
HostIF_UnmapUserMem(vmLinux->pollTimeoutHandle);
if (ioarg != 0) {
vmLinux->pollTimeoutPtr = HostIF_MapUserMem((VA)ioarg,
sizeof *vmLinux->pollTimeoutPtr,
&vmLinux->pollTimeoutHandle);
if (vmLinux->pollTimeoutPtr == NULL) {
retval = -EINVAL;
break;
}
}
break;
}
case IOCTL_VMX86_GET_KERNEL_CLOCK_RATE:
retval = HZ;
break;
case IOCTL_VMX86_FAST_SUSP_RES_SET_OTHER_FLAG:
retval = Vmx86_FastSuspResSetOtherFlag(vm, ioarg);
break;
case IOCTL_VMX86_FAST_SUSP_RES_GET_MY_FLAG:
retval = Vmx86_FastSuspResGetMyFlag(vm, ioarg);
break;
case IOCTL_VMX86_GET_REFERENCE_CLOCK_HZ: {
uint64 refClockHz = HostIF_UptimeFrequency();
retval = HostIF_CopyToUser((void *)ioarg, &refClockHz,
sizeof refClockHz);
break;
}
case IOCTL_VMX86_INIT_PSEUDO_TSC: {
PTSCInitParams params;
retval = HostIF_CopyFromUser(&params, (void *)ioarg, sizeof params);
if (retval != 0) {
break;
}
Vmx86_InitPseudoTSC(&params);
retval = HostIF_CopyToUser((void *)ioarg, &params, sizeof params);
break;
}
case IOCTL_VMX86_CHECK_PSEUDO_TSC: {
PTSCCheckParams params;
retval = HostIF_CopyFromUser(&params, (void *)ioarg, sizeof params);
if (retval != 0) {
break;
}
params.usingRefClock = Vmx86_CheckPseudoTSC(&params.lastTSC,
&params.lastRC);
retval = HostIF_CopyToUser((void *)ioarg, &params, sizeof params);
break;
}
case IOCTL_VMX86_GET_PSEUDO_TSC: {
uint64 ptsc = Vmx86_GetPseudoTSC();
retval = HostIF_CopyToUser((void *)ioarg, &ptsc, sizeof ptsc);
break;
}
case IOCTL_VMX86_SET_HOST_CLOCK_PRIORITY:
/*
* This affects the global fast clock priority, and it only
* takes effect when the fast clock rate transitions from zero
* to a non-zero value.
*
* This is used to allow VMs to optionally work around
* bug 218750 by disabling our default priority boost. If any
* VM chooses to apply this workaround, the effect is permanent
* until vmmon is reloaded!
*/
HostIF_FastClockLock(3);
linuxState.fastClockPriority = MAX(-20, MIN(19, (int)ioarg));
HostIF_FastClockUnlock(3);
retval = 0;
break;
case IOCTL_VMX86_SYNC_GET_TSCS: {
uint64 delta;
if (LinuxDriverSyncReadTSCs(&delta)) {
retval = HostIF_CopyToUser((void *)ioarg, &delta, sizeof delta);
} else {
retval = -EBUSY;
}
break;
}
case IOCTL_VMX86_SET_HOST_SWAP_SIZE: {
uint64 swapSize;
retval = HostIF_CopyFromUser(&swapSize, (void *)ioarg, sizeof swapSize);
if (retval != 0) {
Warning("Could not copy swap size from user, status %d\n", retval);
break;
}
linuxState.swapSize = swapSize;
break;
}
case IOCTL_VMX86_GET_UNAVAIL_PERF_CTRS: {
uint64 ctrs = Vmx86_GetUnavailablePerfCtrs();
retval = HostIF_CopyToUser((void *)ioarg, &ctrs, sizeof ctrs);
break;
}
default:
Warning("Unknown ioctl %d\n", iocmd);
retval = -EINVAL;
}
exit:
return retval;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverQueue --
*
* add the vmLinux to the global queue
*
* Results:
*
* void
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverQueue(VMLinux *vmLinux) // IN/OUT:
{
/*
* insert in global vm queue
*/
HostIF_GlobalLock(12);
vmLinux->next = linuxState.head;
linuxState.head = vmLinux;
HostIF_GlobalUnlock(12);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverDequeue --
*
* remove from active list
*
* Results:
*
* void
* Side effects:
* printk if it is not in the list (error condition)
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverDequeue(VMLinux *vmLinux) // IN/OUT:
{
VMLinux **p;
HostIF_GlobalLock(13);
for (p = &linuxState.head; *p != vmLinux; p = &(*p)->next) {
ASSERT(*p != NULL);
}
*p = vmLinux->next;
vmLinux->next = NULL;
HostIF_GlobalUnlock(13);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverCheckPadding --
*
* check for expected padding --
* this check currently fails on the egcs compiler
*
* Results:
*
* TRUE if the check succeeds -- module will be loaded
*
*
*
* Side effects:
* output to kernel log on error
*
*----------------------------------------------------------------------
*/
static Bool
LinuxDriverCheckPadding(void)
{
DTRWords32 dtr;
uint16 *x;
memset(&dtr, 0, sizeof dtr);
dtr.dtr.limit = 0x1111;
dtr.dtr.offset = 0x22223333;
x = (uint16 *) &dtr;
if (x[0] == 0x1111 && x[1] == 0x3333 && x[2] == 0x2222) {
} else {
Warning("DTR padding\n");
goto error;
}
return TRUE;
error:
printk("/dev/vmmon: Cannot load module. Use standard gcc compiler\n");
return FALSE;
}
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Machine Monitor.");
MODULE_LICENSE("GPL v2");
/*
* Starting with SLE10sp2, Novell requires that IHVs sign a support agreement
* with them and mark their kernel modules as externally supported via a
* change to the module header. If this isn't done, the module will not load
* by default (i.e., neither mkinitrd nor modprobe will accept it).
*/
MODULE_INFO(supported, "external");
Modify the file /usr/lib/vmware/modules/source/vmmon-only/linux/hostif.c so that it reads as follows:
/*********************************************************
* Copyright (C) 1998-2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/*
* hostif.c --
*
* This file implements the platform-specific (here Linux) interface that
* the cross-platform code uses --hpreg
*
*/
/* Must come before any kernel header file --hpreg */
#include "driver-config.h"
/* Must come before vmware.h --hpreg */
#include "compat_timer.h"
#include <linux/binfmts.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/poll.h>
#include <linux/mman.h>
#include <linux/smp.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
# include <asm/asm.h>
#endif
#if defined(_ASM_EXTABLE)
# define VMW_ASM_EXTABLE(from, to) _ASM_EXTABLE(from, to)
#else
/* Compat version copied from asm.h of 2.6.25 kernel */
# define VMW_ASM_FORM(x) " " #x " "
# define VMW_ASM_EX_SEC " .section __ex_table,\"a\"\n"
# ifdef CONFIG_X86_32
# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(a)
# else
# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(b)
# endif
# define VMW_ASM_PTR VMW_ASM_SEL(.long, .quad)
# define VMW_ASM_ALIGN VMW_ASM_SEL(.balign 4, .balign 8)
# define VMW_ASM_EXTABLE(from,to) \
VMW_ASM_EX_SEC \
VMW_ASM_ALIGN "\n" \
VMW_ASM_PTR #from "," #to "\n" \
" .previous\n"
#endif
#include <asm/io.h>
#include <asm/uaccess.h>
#include <linux/mc146818rtc.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/signal.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#include <linux/taskstats_kern.h> // For linux/sched/signal.h without version check
#endif
#include "vmware.h"
#include "x86apic.h"
#include "vm_asm.h"
#include "modulecall.h"
#include "driver.h"
#include "memtrack.h"
#include "phystrack.h"
#include "cpuid.h"
#include "cpuid_info.h"
#include "hostif.h"
#include "hostif_priv.h"
#include "vmhost.h"
#include "x86msr.h"
#include "apic.h"
#include "memDefaults.h"
#include "vcpuid.h"
#include "pgtbl.h"
#include "vmmonInt.h"
#include "versioned_atomic.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
# define global_zone_page_state global_page_state
#endif
static unsigned long get_nr_slab_unreclaimable(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
return global_node_page_state(NR_SLAB_UNRECLAIMABLE);
#else
return global_page_state(NR_SLAB_UNRECLAIMABLE);
#endif
}
static unsigned long get_nr_unevictable(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
return global_node_page_state(NR_UNEVICTABLE);
#else
return global_page_state(NR_UNEVICTABLE);
#endif
}
static unsigned long get_nr_anon_mapped(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
return global_node_page_state(NR_ANON_MAPPED);
#else
return global_page_state(NR_ANON_PAGES);
#endif
}
/*
* Determine if we can use high resolution timers.
*/
#ifdef CONFIG_HIGH_RES_TIMERS
# include <linux/hrtimer.h>
# define VMMON_USE_HIGH_RES_TIMERS
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
# define VMMON_USE_SCHEDULE_HRTIMEOUT
# else
# define VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
static void HostIFWakeupClockThread(unsigned long data);
static DECLARE_TASKLET(timerTasklet, HostIFWakeupClockThread, 0);
# endif
# define close_rtc(filp, files) do {} while(0)
#else
# define close_rtc(filp, files) filp_close(filp, files)
#endif
#define UPTIME_FREQ CONST64(1000000)
/*
* When CONFIG_NO_HZ_FULL is set processors can run tickless
* if there is only one runnable process. When set, the rate
* checks in HostIF_SetFastClockRate and HostIFFastClockThread
* need to be relaxed to allow any non-zero rate to run.
*
* This code can potentially be removed if/when we stop using
* HostIFFastClockThread to drive MonTimer. See PR1088247.
*/
#ifdef CONFIG_NO_HZ_FULL
#define MIN_RATE (0)
#else
#define MIN_RATE ((HZ) + (HZ) / 16)
#endif
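/* Example: with HZ == 1000 and CONFIG_NO_HZ_FULL unset, MIN_RATE is 1000 + 1000/16 = 1062. */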
/*
* Linux seems to like keeping free memory around 30MB
* even under severe memory pressure. Let's give it a little
* more leeway than that for safety.
*/
#define LOCKED_PAGE_SLACK 10000
static struct {
Atomic_uint64 uptimeBase;
VersionedAtomic version;
uint64 monotimeBase;
unsigned long jiffiesBase;
struct timer_list timer;
} uptimeState;
/*
* First Page Locking strategy
* ---------------------------
*
* An early implementation hacked the lock bit for the purpose of locking
* memory. This had a couple of advantages:
* - the vmscan algorithm would never eliminate mappings from the process
* address space
* - easy to assert that things are ok
* - it worked with anonymous memory. Basically, vmscan jumps over these
* pages, their use count stays high, ....
*
* This approach however had a couple of problems:
*
* - it relies on an undocumented interface. (in other words, a total hack)
* - it creates deadlock situations if the application gets a kill -9 or
* otherwise dies ungracefully. linux first tears down the address space,
* then closes file descriptors (including our own device). Unfortunately,
* this leads to a deadlock of the process on pages with the lock bit set.
*
* There is a workaround for that, namely to detect that condition using
* a linux timer. (ugly)
*
* Current Page Locking strategy
* -----------------------------
*
* The current scheme does not use the lock bit, rather it increments the use
* count on the pages that need to be locked down in memory.
*
* The problem is that experiments on certain linux systems (e.g. 2.2.0-pre9)
* showed that linux somehow swaps out anonymous pages, even with the
* increased ref counter.
* Swapping them out to disk is not that big of a deal, but bringing them back
* to a different location is. In any case, anonymous pages in linux are not
* intended to be write-shared (e.g. try to MAP_SHARED /dev/zero).
*
* As a result, the current locking strategy requires that all locked pages are
* backed by the filesystem, not by swap. For now, we use both mapped files and
* sys V shared memory. The user application is responsible for covering these
* cases.
*
*/
#define HOST_UNLOCK_PFN(_vm, _pfn) do { \
_vm = _vm; \
put_page(pfn_to_page(_pfn)); \
} while (0)
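/* The '_vm = _vm' self-assignment above merely references the otherwise-unused argument to avoid compiler warnings. */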
#define HOST_UNLOCK_PFN_BYMPN(_vm, _pfn) do { \
PhysTrack_Remove((_vm)->vmhost->lockedPages, (_pfn)); \
put_page(pfn_to_page(_pfn)); \
} while (0)
uint8 monitorIPIVector;
uint8 hvIPIVector;
/*
*-----------------------------------------------------------------------------
*
* MutexInit --
*
* Initialize a Mutex. --hpreg
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
#ifdef VMX86_DEBUG
static INLINE void
MutexInit(Mutex *mutex, // IN
char const *name) // IN
{
ASSERT(mutex);
ASSERT(name);
sema_init(&mutex->sem, 1);
mutex->name = name;
mutex->cur.pid = -1;
}
#else
# define MutexInit(_mutex, _name) sema_init(&(_mutex)->sem, 1)
#endif
#ifdef VMX86_DEBUG
/*
*-----------------------------------------------------------------------------
*
* MutexIsLocked --
*
* Determine if a Mutex is locked by the current thread. --hpreg
*
* Results:
* TRUE if yes
* FALSE if no
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static INLINE Bool
MutexIsLocked(Mutex *mutex) // IN
{
ASSERT(mutex);
return mutex->cur.pid == current->pid;
}
#endif
/*
*-----------------------------------------------------------------------------
*
* MutexLock --
*
* Acquire a Mutex. --hpreg
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
#ifdef VMX86_DEBUG
static INLINE void
MutexLock(Mutex *mutex, // IN
int callerID) // IN
{
ASSERT(mutex);
ASSERT(!MutexIsLocked(mutex));
down(&mutex->sem);
mutex->cur.pid = current->pid;
mutex->cur.callerID = callerID;
}
#else
# define MutexLock(_mutex, _callerID) down(&(_mutex)->sem)
#endif
/*
*-----------------------------------------------------------------------------
*
* MutexUnlock --
*
* Release a Mutex. --hpreg
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
#ifdef VMX86_DEBUG
static INLINE void
MutexUnlock(Mutex *mutex, // IN
int callerID) // IN
{
ASSERT(mutex);
ASSERT(MutexIsLocked(mutex) && mutex->cur.callerID == callerID);
mutex->prev = mutex->cur;
mutex->cur.pid = -1;
up(&mutex->sem);
}
#else
# define MutexUnlock(_mutex, _callerID) up(&(_mutex)->sem)
#endif
/* This mutex protects the driver-wide state. --hpreg */
static Mutex globalMutex;
/*
* This mutex protects the fast clock rate and is held while
* creating/destroying the fastClockThread. It ranks below
* globalMutex. We can't use globalMutex for this purpose because the
* fastClockThread itself acquires the globalMutex, so trying to hold
* the mutex while destroying the thread can cause a deadlock.
*/
static Mutex fastClockMutex;
/* This mutex protects linuxState.pollList. */
static Mutex pollListMutex;
/*
*----------------------------------------------------------------------
*
* HostIF_PrepareWaitForThreads --
*
* Prepare to wait for another vCPU thread.
*
* Results:
* FALSE: no way on Linux to determine we've already been signalled.
*
* Side effects:
* Current task is interruptible.
*
*----------------------------------------------------------------------
*/
Bool
HostIF_PrepareWaitForThreads(VMDriver *vm, // IN:
Vcpuid currVcpu) // IN:
{
set_current_state(TASK_INTERRUPTIBLE);
vm->vmhost->vcpuSemaTask[currVcpu] = current;
return FALSE;
}
/*
*----------------------------------------------------------------------
*
* HostIF_WaitForThreads --
*
* Wait for another vCPU thread.
*
* Results:
* None.
*
* Side effects:
* Current task may block.
*
*----------------------------------------------------------------------
*/
void
HostIF_WaitForThreads(VMDriver *vm, // UNUSED:
Vcpuid currVcpu) // UNUSED:
{
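/*
* Note: when schedule_hrtimeout() is available the sleep below has
* sub-jiffy resolution; the fallback rounds CROSSCALL_SLEEP_US up to whole
* jiffies, which is coarser but harmless on older kernels.
*/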
#ifdef VMMON_USE_SCHEDULE_HRTIMEOUT
ktime_t timeout = ktime_set(0, CROSSCALL_SLEEP_US * 1000);
schedule_hrtimeout(&timeout, HRTIMER_MODE_REL);
#else
/* Fallback to ms timer resolution is fine for older kernels. */
schedule_timeout(msecs_to_jiffies(CROSSCALL_SLEEP_US / 1000) + 1);
#endif
}
/*
*----------------------------------------------------------------------
*
* HostIF_CancelWaitForThreads --
*
* Cancel waiting for another vCPU thread.
*
* Results:
* None.
*
* Side effects:
* Current task is running and no longer interruptible.
*
*----------------------------------------------------------------------
*/
void
HostIF_CancelWaitForThreads(VMDriver *vm, // IN:
Vcpuid currVcpu) // IN:
{
vm->vmhost->vcpuSemaTask[currVcpu] = NULL;
set_current_state(TASK_RUNNING);
}
/*
*----------------------------------------------------------------------
*
* HostIF_WakeUpYielders --
*
* Wakeup vCPUs that are waiting for the current vCPU.
*
* Results:
* The requested vCPUs are nudged if they are sleeping due to
* Vmx86_YieldToSet.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
HostIF_WakeUpYielders(VMDriver *vm, // IN:
Vcpuid currVcpu) // IN:
{
VCPUSet req;
Vcpuid vcpuid;
uint64 subset;
/*
* PR 1142958: if the VCPUs woken in the crosscallWaitSet re-add themselves
* to this set faster than it can be fully drained, this function never
* exits. Instead, we copy and remove a snapshot of the crosscallWaitSet
* and locally wake up just that snapshot. It is ok that we don't get a
* fully coherent snapshot, as long as the subset copy-and-remove is atomic
* so no VCPU added is lost entirely.
*/
VCPUSet_Empty(&req);
FOR_EACH_SUBSET_IN_SET(subIdx) {
subset = VCPUSet_AtomicReadWriteSubset(&vm->crosscallWaitSet[currVcpu],
0, subIdx);
VCPUSet_UnionSubset(&req, subset, subIdx);
} ROF_EACH_SUBSET_IN_SET();
preempt_disable();
while ((vcpuid = VCPUSet_FindFirst(&req)) != VCPUID_INVALID) {
struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
VCPUSet_Remove(&req, vcpuid);
if (t && (t->state & TASK_INTERRUPTIBLE)) {
wake_up_process(t);
}
}
preempt_enable();
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_InitGlobalLock --
*
* Initialize the global (across all VMs and vmmon) locks.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_InitGlobalLock(void)
{
MutexInit(&globalMutex, "global");
MutexInit(&fastClockMutex, "fastClock");
MutexInit(&pollListMutex, "pollList");
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GlobalLock --
*
* Grabs the global data structure lock.
*
* Results:
* None
*
* Side effects:
* Should be a very low contention lock.
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_GlobalLock(int callerID) // IN
{
MutexLock(&globalMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GlobalUnlock --
*
* Releases the global data structure lock.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_GlobalUnlock(int callerID) // IN
{
MutexUnlock(&globalMutex, callerID);
}
#ifdef VMX86_DEBUG
/*
*-----------------------------------------------------------------------------
*
* HostIF_GlobalLockIsHeld --
*
* Determine if the global lock is held by the current thread.
*
* Results:
* TRUE if yes
* FALSE if no
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIF_GlobalLockIsHeld(void)
{
return MutexIsLocked(&globalMutex);
}
#endif
/*
*-----------------------------------------------------------------------------
*
* HostIF_FastClockLock --
*
* Grabs the fast clock data structure lock.
*
* Results:
* None
*
* Side effects:
* Should be a very low contention lock.
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FastClockLock(int callerID) // IN
{
MutexLock(&fastClockMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_FastClockUnlock --
*
* Releases the fast clock data structure lock.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FastClockUnlock(int callerID) // IN
{
MutexUnlock(&fastClockMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_PollListLock --
*
* Grabs the linuxState.pollList lock.
*
* Results:
* None
*
* Side effects:
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_PollListLock(int callerID) // IN
{
MutexLock(&pollListMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_PollListUnlock --
*
* Releases the linuxState.pollList lock.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_PollListUnlock(int callerID) // IN
{
MutexUnlock(&pollListMutex, callerID);
}
/*
*----------------------------------------------------------------------
*
* MapCrossPage & UnmapCrossPage
*
* Both x86-64 and ia32 need to map crosspage to an executable
* virtual address. We use the vmap interface instead of kmap
* due to bug 43907.
*
* Side effects:
*
* UnmapCrossPage assumes that the page has been refcounted up,
* so it takes care of the put_page.
*
*----------------------------------------------------------------------
*/
static void *
MapCrossPage(struct page *p) // IN:
{
return vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);
}
static void
UnmapCrossPage(struct page *p, // IN:
void *va) // IN:
{
vunmap(va);
put_page(p);
}
/*
*----------------------------------------------------------------------
*
* HostIFHostMemInit --
*
* Initialize per-VM pages lists.
*
* Results:
* 0 on success,
* non-zero on failure.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static int
HostIFHostMemInit(VMDriver *vm) // IN:
{
VMHost *vmh = vm->vmhost;
vmh->lockedPages = PhysTrack_Alloc(vm);
if (!vmh->lockedPages) {
return -1;
}
vmh->AWEPages = PhysTrack_Alloc(vm);
if (!vmh->AWEPages) {
return -1;
}
return 0;
}
/*
*----------------------------------------------------------------------
*
* HostIFHostMemCleanup --
*
* Release per-VM pages lists.
*
* Results:
* None.
*
* Side effects:
* Locked and AWE pages are released.
*
*----------------------------------------------------------------------
*/
static void
HostIFHostMemCleanup(VMDriver *vm) // IN:
{
MPN mpn;
VMHost *vmh = vm->vmhost;
if (!vmh) {
return;
}
HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock.
if (vmh->lockedPages) {
for (mpn = 0;
INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->lockedPages, mpn));) {
HOST_UNLOCK_PFN_BYMPN(vm, mpn);
}
PhysTrack_Free(vmh->lockedPages);
vmh->lockedPages = NULL;
}
if (vmh->AWEPages) {
for (mpn = 0;
INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->AWEPages, mpn));) {
PhysTrack_Remove(vmh->AWEPages, mpn);
put_page(pfn_to_page(mpn));
}
PhysTrack_Free(vmh->AWEPages);
vmh->AWEPages = NULL;
}
HostIF_VMUnlock(vm, 32);
}
/*
*----------------------------------------------------------------------
*
* HostIF_AllocMachinePage --
*
* Alloc non-swappable memory page. The page is not billed to
* a particular VM. Preferably the page should not be mapped into
* the kernel address space.
*
* Results:
* INVALID_MPN or a valid host mpn.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
MPN
HostIF_AllocMachinePage(void)
{
struct page *pg = alloc_page(GFP_HIGHUSER);
return (pg) ? ((MPN)page_to_pfn(pg)) : INVALID_MPN;
}
/*
*----------------------------------------------------------------------
*
* HostIF_FreeMachinePage --
*
* Free an anonymous machine page allocated by
* HostIF_AllocMachinePage(). This page is not tracked in any
* phystracker.
*
* Results:
* Host page is unlocked.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
HostIF_FreeMachinePage(MPN mpn) // IN:
{
struct page *pg = pfn_to_page(mpn);
__free_page(pg);
}
/*
*----------------------------------------------------------------------
*
* HostIF_AllocLockedPages --
*
* Alloc non-swappable memory.
*
* Results:
* negative value on complete failure
* non-negative value on partial/full completion, number of MPNs
* allocated & filled in pmpn returned.
*
* Side effects:
* Pages allocated.
*
*----------------------------------------------------------------------
*/
int
HostIF_AllocLockedPages(VMDriver *vm, // IN: VM instance pointer
VA64 addr, // OUT: pointer to user or kernel buffer for MPNs
unsigned numPages, // IN: number of pages to allocate
Bool kernelMPNBuffer)// IN: is the MPN buffer in kernel or user address space?
{
MPN *pmpn = VA64ToPtr(addr);
VMHost *vmh = vm->vmhost;
unsigned int cnt;
int err = 0;
if (!vmh || !vmh->AWEPages) {
return -EINVAL;
}
for (cnt = 0; cnt < numPages; cnt++) {
struct page* pg;
MPN mpn;
pg = alloc_page(GFP_HIGHUSER);
if (!pg) {
err = -ENOMEM;
break;
}
mpn = (MPN)page_to_pfn(pg);
if (kernelMPNBuffer) {
*pmpn = mpn;
} else if (HostIF_CopyToUser(pmpn, &mpn, sizeof *pmpn) != 0) {
__free_page(pg);
err = -EFAULT;
break;
}
pmpn++;
if (PhysTrack_Test(vmh->AWEPages, mpn)) {
Warning("%s: duplicate MPN %016" FMT64 "x\n", __func__, mpn);
}
PhysTrack_Add(vmh->AWEPages, mpn);
}
return cnt ? cnt : err;
}
/*
*----------------------------------------------------------------------
*
* HostIF_FreeLockedPages --
*
* Free non-swappable memory.
*
* Results:
* On success: 0. All pages were unlocked.
* On failure: Non-zero system error code. No page was unlocked.
*
* Side effects:
* Pages freed.
*
*----------------------------------------------------------------------
*/
int
HostIF_FreeLockedPages(VMDriver *vm, // IN: VM instance pointer
VA64 addr, // IN: user or kernel array of MPNs
unsigned numPages, // IN: number of pages to free
Bool kernelMPNBuffer) // IN: is the MPN buffer in kernel or user address space?
{
const int MPN_BATCH = 64;
MPN const *pmpn = VA64ToPtr(addr);
VMHost *vmh = vm->vmhost;
unsigned int cnt;
struct page *pg;
MPN *mpns;
mpns = HostIF_AllocKernelMem(sizeof *mpns * MPN_BATCH, TRUE);
if (mpns == NULL) {
return -ENOMEM;
}
if (!vmh || !vmh->AWEPages) {
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
if (!kernelMPNBuffer) {
if (numPages > MPN_BATCH) {
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
if (HostIF_CopyFromUser(mpns, pmpn, numPages * sizeof *pmpn)) {
printk(KERN_DEBUG "Cannot read from process address space at %p\n",
pmpn);
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
pmpn = mpns;
}
for (cnt = 0; cnt < numPages; cnt++) {
if (!PhysTrack_Test(vmh->AWEPages, pmpn[cnt])) {
printk(KERN_DEBUG "Attempted to free unallocated MPN %016" FMT64 "X\n",
pmpn[cnt]);
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
pg = pfn_to_page(pmpn[cnt]);
if (page_count(pg) != 1) {
// should this case be considered a failure?
printk(KERN_DEBUG "Page %016" FMT64 "X is still used by someone "
"(use count %u, VM %p)\n", pmpn[cnt],
page_count(pg), vm);
}
}
for (cnt = 0; cnt < numPages; cnt++) {
pg = pfn_to_page(pmpn[cnt]);
PhysTrack_Remove(vmh->AWEPages, pmpn[cnt]);
__free_page(pg);
}
HostIF_FreeKernelMem(mpns);
return 0;
}
/*
*----------------------------------------------------------------------
*
* HostIF_Init --
*
* Initialize the host-dependent part of the driver.
*
* Results:
* zero on success, non-zero on error.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
int
HostIF_Init(VMDriver *vm) // IN:
{
vm->memtracker = MemTrack_Init(vm);
if (vm->memtracker == NULL) {
return -1;
}
vm->vmhost = (VMHost *) HostIF_AllocKernelMem(sizeof *vm->vmhost, TRUE);
if (vm->vmhost == NULL) {
return -1;
}
memset(vm->vmhost, 0, sizeof *vm->vmhost);
if (HostIFHostMemInit(vm)) {
return -1;
}
MutexInit(&vm->vmhost->vmMutex, "vm");
return 0;
}
/*
*------------------------------------------------------------------------------
*
* HostIF_LookupUserMPN --
*
* Lookup the MPN of a locked user page by user VA.
*
* Results:
* A status code and the MPN on success.
*
* Side effects:
* None
*
*------------------------------------------------------------------------------
*/
int
HostIF_LookupUserMPN(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: user VA of the page
MPN *mpn) // OUT
{
void *uvAddr = VA64ToPtr(uAddr);
int retval = PAGE_LOCK_SUCCESS;
*mpn = PgtblVa2MPN((VA)uvAddr);
/*
* On failure, check whether the page is locked.
*
* While we don't require the page to be locked by HostIF_LockPage(),
* it does provide extra information.
*
* -- edward
*/
if (*mpn == INVALID_MPN) {
if (vm == NULL) {
retval += PAGE_LOOKUP_NO_VM;
} else {
MemTrackEntry *entryPtr =
MemTrack_LookupVPN(vm->memtracker, PTR_2_VPN(uvAddr));
if (entryPtr == NULL) {
retval += PAGE_LOOKUP_NOT_TRACKED;
} else if (entryPtr->mpn == 0) {
retval += PAGE_LOOKUP_NO_MPN;
} else {
/*
* Kernel can remove PTEs/PDEs from our pagetables even if pages
* are locked...
*/
volatile int c;
get_user(c, (char *)uvAddr);
*mpn = PgtblVa2MPN((VA)uvAddr);
if (*mpn == entryPtr->mpn) {
#ifdef VMX86_DEBUG
printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
"now back at %016" FMT64 "x\n",
uvAddr, current->comm, current->pid, *mpn);
#endif
} else if (*mpn != INVALID_MPN) {
printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
"now back at %016" FMT64"x (old=%016" FMT64 "x)\n",
uvAddr, current->comm, current->pid, *mpn,
entryPtr->mpn);
*mpn = INVALID_MPN;
} else {
printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
"and is lost (old=%016" FMT64 "x)\n", uvAddr, current->comm,
current->pid, entryPtr->mpn);
*mpn = entryPtr->mpn;
}
}
}
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* HostIF_InitFP --
*
* Masks IRQ13 if it is not already masked.
*
* Results:
* prevents INTR #0x2d (IRQ 13) from being generated --
* assume that Int16 works for interrupt reporting
*
*
* Side effects:
* PIC
*
*----------------------------------------------------------------------
*/
void
HostIF_InitFP(VMDriver *vm) // IN:
{
int mask = (1 << (0xD - 0x8));
uint8 val = inb(0xA1);
if (!(val & mask)) {
val = val | mask;
outb(val, 0xA1);
}
}
/*
*-----------------------------------------------------------------------------
*
* HostIFGetUserPages --
*
* Lock the pages of a user-level address space in memory.
* If ppages is NULL, pages are only marked as dirty.
*
* Results:
* Zero on success, non-zero on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
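/*
* Note on the #if ladder below: get_user_pages() changed prototype twice.
* Before 4.6 it took the task and mm explicitly; 4.6 dropped those two
* arguments; 4.9 merged the write/force flags into a single gup_flags
* argument. Passing 0 keeps the historical read-only, no-force behavior in
* all three variants.
*/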
static int
HostIFGetUserPages(void *uvAddr, // IN
struct page **ppages, // OUT
unsigned int numPages) // IN
{
int retval;
down_read(&current->mm->mmap_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
retval = get_user_pages((unsigned long)uvAddr, numPages, 0, ppages, NULL);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
retval = get_user_pages((unsigned long)uvAddr, numPages, 0, 0, ppages, NULL);
#else
retval = get_user_pages(current, current->mm, (unsigned long)uvAddr,
numPages, 0, 0, ppages, NULL);
#endif
up_read(&current->mm->mmap_sem);
return retval != numPages;
}
/*
*----------------------------------------------------------------------
*
* HostIF_IsLockedByMPN --
*
* Checks if mpn was locked using allowMultipleMPNsPerVA.
*
* Results:
* TRUE if mpn is present in the physTracker.
*
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
HostIF_IsLockedByMPN(VMDriver *vm, // IN:
MPN mpn) // IN:
{
return PhysTrack_Test(vm->vmhost->lockedPages, mpn);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_LockPage --
*
* Look up the MPN of a pinned user-level address space
*
* Results:
* A PAGE_LOCK_* status code and the MPN on success.
*
* Side effects:
* Adds the page to the MemTracker; if allowMultipleMPNsPerVA is set, the page
* is added to the VM's PhysTracker instead.
*
*-----------------------------------------------------------------------------
*/
int
HostIF_LockPage(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: user VA of the page
Bool allowMultipleMPNsPerVA, // IN: allow to lock many pages per VA
MPN *mpn) // OUT: pinned page
{
void *uvAddr = VA64ToPtr(uAddr);
struct page *page;
VPN vpn;
MemTrackEntry *entryPtr = NULL;
vpn = PTR_2_VPN(uvAddr);
if (!allowMultipleMPNsPerVA) {
entryPtr = MemTrack_LookupVPN(vm->memtracker, vpn);
/*
* Already tracked and locked
*/
if (entryPtr != NULL && entryPtr->mpn != 0) {
return PAGE_LOCK_ALREADY_LOCKED;
}
}
if (HostIFGetUserPages(uvAddr, &page, 1)) {
return PAGE_LOCK_FAILED;
}
*mpn = (MPN)page_to_pfn(page);
if (allowMultipleMPNsPerVA) {
/*
* Add the MPN to the PhysTracker that tracks locked pages.
*/
struct PhysTracker* const pt = vm->vmhost->lockedPages;
if (PhysTrack_Test(pt, *mpn)) {
put_page(page);
return PAGE_LOCK_ALREADY_LOCKED;
}
PhysTrack_Add(pt, *mpn);
} else {
/*
* If the entry doesn't exist, add it to the memtracker
* otherwise we just update the mpn.
*/
if (entryPtr == NULL) {
entryPtr = MemTrack_Add(vm->memtracker, vpn, *mpn);
if (entryPtr == NULL) {
HOST_UNLOCK_PFN(vm, *mpn);
return PAGE_LOCK_MEMTRACKER_ERROR;
}
} else {
entryPtr->mpn = *mpn;
}
}
return PAGE_LOCK_SUCCESS;
}
/*
*----------------------------------------------------------------------
*
* HostIF_UnlockPage --
*
* Unlock a pinned user-level page.
*
* Results:
* Status PAGE_UNLOCK_* code.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
int
HostIF_UnlockPage(VMDriver *vm, // IN:
VA64 uAddr) // IN:
{
void *addr = VA64ToPtr(uAddr);
VPN vpn;
MemTrackEntry *e;
vpn = VA_2_VPN((VA)addr);
e = MemTrack_LookupVPN(vm->memtracker, vpn);
if (e == NULL) {
return PAGE_UNLOCK_NOT_TRACKED;
}
if (e->mpn == 0) {
return PAGE_UNLOCK_NO_MPN;
}
HOST_UNLOCK_PFN(vm, e->mpn);
e->mpn = 0;
return PAGE_UNLOCK_SUCCESS;
}
/*
*----------------------------------------------------------------------
*
* HostIF_UnlockPageByMPN --
*
* Unlock a locked user mode page. The page doesn't need to be mapped
* anywhere.
*
* Results:
* Status code. Returns a PAGE_LOOKUP_* error if the page can't be found or
* a PAGE_UNLOCK_* error if the page can't be unlocked.
*
* Side effects:
* Removes the MPN from the VM's PhysTracker.
*
*----------------------------------------------------------------------
*/
int
HostIF_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver
MPN mpn, // IN: the MPN to unlock
VA64 uAddr) // IN: optional(debugging) VA for the MPN
{
if (!PhysTrack_Test(vm->vmhost->lockedPages, mpn)) {
return PAGE_UNLOCK_NO_MPN;
}
#ifdef VMX86_DEBUG
{
void *va = VA64ToPtr(uAddr);
MemTrackEntry *e;
/*
* Verify for debugging that VA and MPN make sense.
* PgtblVa2MPN() can fail under high memory pressure.
*/
if (va != NULL) {
MPN lookupMpn = PgtblVa2MPN((VA)va);
if (lookupMpn != INVALID_MPN && mpn != lookupMpn) {
Warning("Page lookup fail %#"FMT64"x %016" FMT64 "x %p\n",
mpn, lookupMpn, va);
return PAGE_LOOKUP_INVALID_ADDR;
}
}
/*
* Verify that this MPN was locked with
* HostIF_LockPage(allowMultipleMPNsPerVA = TRUE).
* That means that this MPN should not be in the MemTracker.
*/
e = MemTrack_LookupMPN(vm->memtracker, mpn);
if (e) {
Warning("%s(): mpn=%#"FMT64"x va=%p was permanently locked with "
"vpn=0x%"FMT64"x\n", __func__, mpn, va, e->vpn);
return PAGE_UNLOCK_MISMATCHED_TYPE;
}
}
#endif
HOST_UNLOCK_PFN_BYMPN(vm, mpn);
return PAGE_UNLOCK_SUCCESS;
}
static void
UnlockEntry(void *clientData, // IN:
MemTrackEntry *entryPtr) // IN:
{
VMDriver *vm = (VMDriver *)clientData;
if (entryPtr->mpn) {
HOST_UNLOCK_PFN(vm,entryPtr->mpn);
entryPtr->mpn = 0;
}
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_FreeAllResources --
*
* Free all host-specific VM resources.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FreeAllResources(VMDriver *vm) // IN
{
unsigned int cnt;
HostIFHostMemCleanup(vm);
if (vm->memtracker) {
MemTrack_Cleanup(vm->memtracker, UnlockEntry, vm);
vm->memtracker = NULL;
}
if (vm->vmhost) {
for (cnt = vm->vmhost->crosspagePagesCount; cnt > 0; ) {
struct page* p = vm->vmhost->crosspagePages[--cnt];
UnmapCrossPage(p, vm->crosspage[cnt]);
}
vm->vmhost->crosspagePagesCount = 0;
if (vm->vmhost->hostAPICIsMapped) {
ASSERT(vm->hostAPIC.base != NULL);
iounmap((void*)vm->hostAPIC.base);
vm->hostAPIC.base = NULL;
vm->vmhost->hostAPICIsMapped = FALSE;
}
HostIF_FreeKernelMem(vm->vmhost);
vm->vmhost = NULL;
}
}
/*
*----------------------------------------------------------------------
*
* HostIF_AllocKernelMem
*
* Allocate some kernel memory for the driver.
*
* Results:
* The address allocated or NULL on error.
*
*
* Side effects:
* memory is malloced
*----------------------------------------------------------------------
*/
void *
HostIF_AllocKernelMem(size_t size, // IN:
int wired) // IN:
{
void * ptr = kmalloc(size, GFP_KERNEL);
if (ptr == NULL) {
Warning("%s failed (size=%p)\n", __func__, (void*)size);
}
return ptr;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_AllocPage --
*
* Allocate a page (whose content is undetermined)
*
* Results:
* The kernel virtual address of the page
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_AllocPage(void)
{
VA kvAddr;
kvAddr = __get_free_page(GFP_KERNEL);
if (kvAddr == 0) {
Warning("%s: __get_free_page() failed\n", __func__);
}
return (void *)kvAddr;
}
/*
*----------------------------------------------------------------------
*
* HostIF_FreeKernelMem
*
* Free kernel memory allocated for the driver.
*
* Results:
* None.
*
* Side effects:
* memory is freed.
*----------------------------------------------------------------------
*/
void
HostIF_FreeKernelMem(void *ptr) // IN:
{
kfree(ptr);
}
void
HostIF_FreePage(void *ptr) // IN:
{
VA vAddr = (VA)ptr;
if (vAddr & (PAGE_SIZE-1)) {
Warning("%s %p misaligned\n", __func__, (void*)vAddr);
} else {
free_page(vAddr);
}
}
/*
*----------------------------------------------------------------------
*
* HostIF_EstimateLockedPageLimit --
*
* Estimates how many memory pages can be locked or allocated
* from the kernel without causing the host to die or to be really upset.
*
* Results:
* The maximum number of pages that can be locked.
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
unsigned int
HostIF_EstimateLockedPageLimit(const VMDriver* vm, // IN
unsigned int currentlyLockedPages) // IN
{
/*
* This variable is available and exported to modules
* since at least 2.6.0.
*/
extern unsigned long totalram_pages;
unsigned int totalPhysicalPages = totalram_pages;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
return MemDefaults_CalcMaxLockedPages(totalPhysicalPages);
#else
/*
* Use the memory information linux exports as of late for a more
* precise estimate of locked memory. All kernel page-related structures
* (slab, pagetable) are as good as locked. Unevictable includes things
* that are explicitly marked as such (like mlock()). Huge pages are
* also as good as locked, since we don't use them. Lastly, without
* available swap, anonymous pages become locked in memory as well.
*/
unsigned int forHost;
unsigned int reservedPages = MEMDEFAULTS_MIN_HOST_PAGES;
unsigned int hugePages = (vm == NULL) ? 0 :
BYTES_2_PAGES(vm->memInfo.hugePageBytes);
unsigned int lockedPages = global_zone_page_state(NR_PAGETABLE) +
get_nr_slab_unreclaimable() +
get_nr_unevictable() +
hugePages + reservedPages;
unsigned int anonPages = get_nr_anon_mapped();
unsigned int swapPages = BYTES_2_PAGES(linuxState.swapSize);
if (anonPages > swapPages) {
lockedPages += anonPages - swapPages;
}
forHost = lockedPages + LOCKED_PAGE_SLACK;
if (forHost > totalPhysicalPages) {
forHost = totalPhysicalPages;
}
return totalPhysicalPages - forHost;
#endif
}
/*
*----------------------------------------------------------------------
*
* HostIF_Wait --
*
* Waits for specified number of milliseconds.
*
*----------------------------------------------------------------------
*/
void
HostIF_Wait(unsigned int timeoutMs)
{
msleep_interruptible(timeoutMs);
}
/*
*----------------------------------------------------------------------
*
* HostIF_WaitForFreePages --
*
* Waits for pages to be available for allocation or locking.
*
* Results:
* New pages are likely to be available for allocation or locking.
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
void
HostIF_WaitForFreePages(unsigned int timeoutMs) // IN:
{
static unsigned count;
msleep_interruptible(timeoutMs);
count++;
}
/*
*----------------------------------------------------------------------
*
* HostIFReadUptimeWork --
*
* Reads the current uptime. The uptime is based on gettimeofday,
* which provides the needed high resolution. However, we don't
* want uptime to be warped by e.g. calls to settimeofday. So, we
* use a jiffies based monotonic clock to sanity check the uptime.
* If the uptime is more than one second from the monotonic time,
* we assume that the time of day has been set, and recalculate the
* uptime base to get uptime back on track with monotonic time. On
* the other hand, we do expect jiffies based monotonic time and
* timeofday to have small drift (due to NTP rate correction, etc).
* We handle this by rebasing the jiffies based monotonic clock
* every second (see HostIFUptimeResyncMono).
*
* Results:
* The uptime, in units of UPTIME_FREQ. Also returns the jiffies
* value that was used in the monotonic time calculation.
*
* Side effects:
* May reset the uptime base in the case gettimeofday warp was
* detected.
*
*----------------------------------------------------------------------
*/
static uint64
HostIFReadUptimeWork(unsigned long *j) // OUT: current jiffies
{
struct timeval tv;
uint64 monotime, uptime, upBase, monoBase;
int64 diff;
uint32 version;
unsigned long jifs, jifBase;
unsigned int attempts = 0;
/* Assert that HostIF_InitUptime has been called. */
ASSERT(uptimeState.timer.function);
retry:
do {
version = VersionedAtomic_BeginTryRead(&uptimeState.version);
jifs = jiffies;
jifBase = uptimeState.jiffiesBase;
monoBase = uptimeState.monotimeBase;
} while (!VersionedAtomic_EndTryRead(&uptimeState.version, version));
do_gettimeofday(&tv);
upBase = Atomic_Read64(&uptimeState.uptimeBase);
monotime = (uint64)(jifs - jifBase) * (UPTIME_FREQ / HZ);
monotime += monoBase;
uptime = tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ;
uptime += upBase;
/*
* Use the jiffies based monotonic time to sanity check gettimeofday.
* If they differ by more than one second, assume the time of day has
* been warped, and use the jiffies time to undo (most of) the warp.
*/
diff = uptime - monotime;
if (UNLIKELY(diff < -UPTIME_FREQ || diff > UPTIME_FREQ)) {
/* Compute a new uptimeBase to get uptime back on track. */
uint64 newUpBase = monotime - (uptime - upBase);
attempts++;
if (!Atomic_CMPXCHG64(&uptimeState.uptimeBase, &upBase, &newUpBase) &&
attempts < 5) {
/* Another thread updated uptimeBase. Recalculate uptime. */
goto retry;
}
uptime = monotime;
Log("%s: detected settimeofday: fixed uptimeBase old %"FMT64"u "
"new %"FMT64"u attempts %u\n", __func__,
upBase, newUpBase, attempts);
}
*j = jifs;
return uptime;
}
/*
*----------------------------------------------------------------------
*
* HostIFUptimeResyncMono --
*
* Timer that fires every second to resynchronize the jiffies based
* monotonic time with the uptime.
*
* Results:
* None
*
* Side effects:
* Resets the monotonic time bases so that jiffies based monotonic
* time does not drift from gettimeofday over the long term.
*
*----------------------------------------------------------------------
*/
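/*
* Note: compat_timer_arg_t comes from the compat_timer.h shim added earlier
* in this post; it is "unsigned long" before 4.15 and "struct timer_list *"
* from 4.15 on, so this callback compiles against both timer APIs.
*/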
static void
HostIFUptimeResyncMono(compat_timer_arg_t unused) // IN: ignored
{
unsigned long jifs;
uintptr_t flags;
/*
* Read the uptime and the corresponding jiffies value. This will
* also correct the uptime (which is based on time of day) if needed
* before we rebase monotonic time (which is based on jiffies).
*/
uint64 uptime = HostIFReadUptimeWork(&jifs);
/*
* Every second, recalculate monoBase and jiffiesBase to squash small
* drift between gettimeofday and jiffies. Also, this prevents
* (jiffies - jiffiesBase) wrap on 32-bits.
*/
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
VersionedAtomic_BeginWrite(&uptimeState.version);
uptimeState.monotimeBase = uptime;
uptimeState.jiffiesBase = jifs;
VersionedAtomic_EndWrite(&uptimeState.version);
RESTORE_FLAGS(flags);
/* Reschedule this timer to expire in one second. */
mod_timer(&uptimeState.timer, jifs + HZ);
}
/*
*----------------------------------------------------------------------
*
* HostIF_InitUptime --
*
* Initialize the uptime clock's state.
*
* Results:
* None
*
* Side effects:
* Sets the initial values for the uptime state, and schedules
* the uptime timer.
*
*----------------------------------------------------------------------
*/
void
HostIF_InitUptime(void)
{
struct timeval tv;
uptimeState.jiffiesBase = jiffies;
do_gettimeofday(&tv);
Atomic_Write64(&uptimeState.uptimeBase,
-(tv.tv_usec * (UPTIME_FREQ / 1000000) +
tv.tv_sec * UPTIME_FREQ));
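/*
* timer_setup() is native on recent kernels (it appeared around 4.14/4.15);
* on older ones it is supplied by the compat_timer.h shim, which maps it
* onto init_timer().
*/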
timer_setup(&uptimeState.timer, HostIFUptimeResyncMono, 0);
mod_timer(&uptimeState.timer, jiffies + HZ);
}
/*
*----------------------------------------------------------------------
*
* HostIF_CleanupUptime --
*
* Cleanup uptime state, called at module unloading time.
*
* Results:
* None
*
* Side effects:
* Deschedule the uptime timer.
*
*----------------------------------------------------------------------
*/
void
HostIF_CleanupUptime(void)
{
del_timer_sync(&uptimeState.timer);
}
/*
*----------------------------------------------------------------------
*
* HostIF_ReadUptime --
*
* Read the system time. The returned value has no meaningful absolute
* value; only the difference between two calls should be used.
*
* Results:
* Units are given by HostIF_UptimeFrequency.
*
* Side effects:
* See HostIFReadUptimeWork
*
*----------------------------------------------------------------------
*/
uint64
HostIF_ReadUptime(void)
{
unsigned long jifs;
return HostIFReadUptimeWork(&jifs);
}
/*
*----------------------------------------------------------------------
*
* HostIF_UptimeFrequency
*
* Return the frequency of the counter that HostIF_ReadUptime reads.
*
* Results:
* Frequency in Hz.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
uint64
HostIF_UptimeFrequency(void)
{
return UPTIME_FREQ;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_CopyFromUser --
*
* Copy memory from the user application into a kernel buffer. This
* function may block, so don't call it while holding any kind of
* lock. --hpreg
*
* Results:
* 0 on success
* -EFAULT on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_CopyFromUser(void *dst, // OUT
const void *src, // IN
unsigned int len) // IN
{
return copy_from_user(dst, src, len) ? -EFAULT : 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_CopyToUser --
*
* Copy memory to the user application from a kernel buffer. This
* function may block, so don't call it while holding any kind of
* lock. --hpreg
*
* Results:
* 0 on success
* -EFAULT on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_CopyToUser(void *dst, // OUT
const void *src, // IN
unsigned int len) // IN
{
return copy_to_user(dst, src, len) ? -EFAULT : 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_MapCrossPage --
*
* Obtain kernel pointer to crosspage.
*
* We must return a VA that is obtained through a kernel mapping, so that
* the mapping never goes away (see bug 29753).
*
* However, the LA corresponding to that VA must not overlap with the
* monitor (see bug 32922). The userland code ensures that by only
* allocating cross pages from low memory. For those pages, the kernel
* uses a permanent mapping, instead of a temporary one with a high LA.
*
* Results:
* The kernel virtual address on success
* NULL on failure
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_MapCrossPage(VMDriver *vm, // IN
VA64 uAddr) // IN
{
void *p = VA64ToPtr(uAddr);
struct page *page;
VA vPgAddr;
VA ret;
if (HostIFGetUserPages(p, &page, 1)) {
return NULL;
}
vPgAddr = (VA) MapCrossPage(page);
HostIF_VMLock(vm, 27);
if (vm->vmhost->crosspagePagesCount >= MAX_INITBLOCK_CPUS) {
HostIF_VMUnlock(vm, 27);
UnmapCrossPage(page, (void*)vPgAddr);
return NULL;
}
vm->vmhost->crosspagePages[vm->vmhost->crosspagePagesCount++] = page;
HostIF_VMUnlock(vm, 27);
ret = vPgAddr | (((VA)p) & (PAGE_SIZE - 1));
return (void*)ret;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_AllocCrossGDT --
*
* Allocate the per-vmmon cross GDT page set.
*
* See bora/doc/worldswitch-pages.txt for the requirements on the cross
* GDT page set addresses.
*
* Results:
* On success: Host kernel virtual address of the first cross GDT page.
* Use HostIF_FreeCrossGDT() with the same value to free.
* The 'crossGDTMPNs' array is filled with the MPNs of all the
* cross GDT pages.
* On failure: NULL.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_AllocCrossGDT(uint32 numPages, // IN: Number of pages
MPN maxValidFirst, // IN: Highest valid MPN of first page
MPN *crossGDTMPNs) // OUT: Array of MPNs
{
MPN startMPN;
struct page *pages;
uint32 i;
void *crossGDT;
/*
* In practice, allocating a low page (MPN <= 0x100000 - 1) is equivalent to
* allocating a page with MPN <= 0xFEC00 - 1:
*
* o PC architecture guarantees that there is no RAM in top 16MB of 4GB
* range.
*
* o 0xFEC00000 is IOAPIC base. There could be RAM immediately below,
* but not above.
*
* How do we allocate a low page? We can safely use GFP_DMA32 when
* available. On 64bit kernels before GFP_DMA32 was introduced we
* fall back to DMA zone (which is not quite necessary for boxes
* with less than ~3GB of memory). On 32bit kernels we are using
* normal zone - which is usually 1GB, and at most 4GB (for 4GB/4GB
* kernels). And for 4GB/4GB kernels same restriction as for 64bit
* kernels applies - there is no RAM in top 16MB immediately below
* 4GB so alloc_pages() cannot return such page.
*/
ASSERT(0xFEC00 - 1 <= maxValidFirst);
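/*
* The empty loop below computes the smallest order with
* (1 << order) >= numPages, which is what alloc_pages() expects;
* HostIF_FreeCrossGDT() repeats the same computation so that free_pages()
* gets a matching order.
*/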
for (i = 0; (1 << i) < numPages; i++) { }
#ifdef GFP_DMA32
pages = alloc_pages(GFP_KERNEL | GFP_DMA32, i);
#else
pages = alloc_pages(GFP_KERNEL | GFP_DMA, i);
#endif
crossGDT = NULL;
if (pages == NULL) {
Warning("%s: unable to alloc crossGDT (%u)\n", __func__, i);
} else {
startMPN = page_to_pfn(pages);
for (i = 0; i < numPages; i++) {
crossGDTMPNs[i] = startMPN + i;
}
crossGDT = (void *)page_address(pages);
}
return crossGDT;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_FreeCrossGDT --
*
* Free the per-vmmon cross GDT page set allocated with
* HostIF_AllocCrossGDT().
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FreeCrossGDT(uint32 numPages, // IN: Number of pages
void *crossGDT) // IN: Kernel VA of first cross GDT page
{
uint32 i;
for (i = 0; (1 << i) < numPages; i++) { }
free_pages((VA)crossGDT, i);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_VMLock --
*
* Grabs per-VM data structure lock. The lock is not recursive.
* The global lock has lower rank so the global lock should be grabbed
* first if both locks are acquired.
*
* It should be a medium contention lock. Also it should be fast:
* it is used for protecting of frequent page allocation and locking.
*
* Results:
* None
*
* Side effects:
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_VMLock(VMDriver *vm, // IN
int callerID) // IN
{
ASSERT(vm);
ASSERT(vm->vmhost);
MutexLock(&vm->vmhost->vmMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_VMUnlock --
*
* Releases per-VM data structure lock.
*
* Results:
* None
*
* Side effects:
* Can wake up the thread blocked on this lock.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_VMUnlock(VMDriver *vm, // IN
int callerID) // IN
{
ASSERT(vm);
ASSERT(vm->vmhost);
MutexUnlock(&vm->vmhost->vmMutex, callerID);
}
#ifdef VMX86_DEBUG
/*
*-----------------------------------------------------------------------------
*
* HostIF_VMLockIsHeld --
*
* Determine if the per-VM lock is held by the current thread.
*
* Results:
* TRUE if yes
* FALSE if no
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIF_VMLockIsHeld(VMDriver *vm) // IN
{
ASSERT(vm);
ASSERT(vm->vmhost);
return MutexIsLocked(&vm->vmhost->vmMutex);
}
#endif
/*
* Utility routines for accessing and enabling the APIC
*/
/*
* Defines for accessing the APIC. We use readl/writel to access the APIC
* which is how Linux wants you to access I/O memory (though on the x86
* just dereferencing a pointer works just fine).
*/
#define APICR_TO_ADDR(apic, reg) (apic + (reg << 4))
#define GET_APIC_REG(apic, reg) (readl(APICR_TO_ADDR(apic, reg)))
#define SET_APIC_REG(apic, reg, val) (writel(val, APICR_TO_ADDR(apic, reg)))
#define APIC_MAXLVT(apic) ((GET_APIC_REG(apic, APICR_VERSION) >> 16) & 0xff)
#define APIC_VERSIONREG(apic) (GET_APIC_REG(apic, APICR_VERSION) & 0xff)
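/*
* Note: APIC registers sit on 16-byte boundaries, hence the (reg << 4)
* byte-offset computation in APICR_TO_ADDR(). For example,
* GET_APIC_REG(apic, APICR_VERSION) expands to
* readl(apic + (APICR_VERSION << 4)).
*/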
#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \
defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC)
/*
*----------------------------------------------------------------------
*
* isVAReadable --
*
* Verify that the passed VA is accessible without crashing...
*
* Results:
* TRUE if address is readable, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
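/*
* Note: the set_fs(get_ds()) dance below temporarily lifts the user/kernel
* address-limit check so that HostIF_CopyFromUser() can probe a kernel VA.
* get_ds()/set_fs() still exist on FC26-era kernels; much later kernels
* removed them, and this probe would need a different mechanism there.
*/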
static Bool
isVAReadable(VA r) // IN:
{
mm_segment_t old_fs;
uint32 dummy;
int ret;
old_fs = get_fs();
set_fs(get_ds());
r = APICR_TO_ADDR(r, APICR_VERSION);
ret = HostIF_CopyFromUser(&dummy, (void*)r, sizeof(dummy));
set_fs(old_fs);
return ret == 0;
}
/*
*----------------------------------------------------------------------
*
* SetVMAPICAddr --
*
* Maps the host cpu's APIC. The virtual address is stashed in
* the VMDriver structure.
*
* Results:
* None.
*
* Side effects:
* The VMDriver structure is updated.
*
*----------------------------------------------------------------------
*/
static void
SetVMAPICAddr(VMDriver *vm, // IN/OUT: driver state
MA ma) // IN: host APIC's ma
{
volatile void *hostapic;
ASSERT_ON_COMPILE(APICR_SIZE <= PAGE_SIZE);
hostapic = (volatile void *) ioremap_nocache(ma, PAGE_SIZE);
if (hostapic) {
if ((APIC_VERSIONREG(hostapic) & 0xF0) == 0x10) {
vm->hostAPIC.base = (volatile uint32 (*)[4]) hostapic;
ASSERT(vm->vmhost != NULL);
vm->vmhost->hostAPICIsMapped = TRUE;
} else {
iounmap((void*)hostapic);
}
}
}
/*
*----------------------------------------------------------------------
*
* ProbeAPIC --
*
* Attempts to map the host APIC.
*
* Most versions of Linux already provide access to a mapped
* APIC. This function is just a backup.
*
* Caveat: We assume that the APIC physical address is the same
* on all host cpus.
*
* Results:
* TRUE if APIC was found, FALSE if not.
*
* Side effects:
* May map the APIC.
*
*----------------------------------------------------------------------
*/
static Bool
ProbeAPIC(VMDriver *vm, // IN/OUT: driver state
Bool setVMPtr) // IN: set a pointer to the APIC's virtual address
{
MA ma = APIC_GetMA();
if (ma == (MA)-1) {
return FALSE;
}
if (setVMPtr) {
SetVMAPICAddr(vm, ma);
} else {
vm->hostAPIC.base = NULL;
}
return TRUE;
}
#endif
/*
*----------------------------------------------------------------------
*
* HostIF_APICInit --
*
* Initialize APIC behavior.
* Attempts to map the host APIC into vm->hostAPIC.
*
* We don't attempt to refresh the mapping after a host cpu
* migration. Fortunately, hosts tend to use the same address
* for all APICs.
*
* Most versions of Linux already provide a mapped APIC. We
* have backup code to read APIC_BASE and map it, if needed.
*
* Results:
* TRUE
*
* Side effects:
* May map the host APIC.
*
*----------------------------------------------------------------------
*/
Bool
HostIF_APICInit(VMDriver *vm, // IN:
Bool setVMPtr, // IN:
Bool probe) // IN: force probing
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \
defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC)
static Bool apicIPILogged = FALSE;
VA kAddr;
monitorIPIVector = SPURIOUS_APIC_VECTOR;
#if defined(POSTED_INTR_VECTOR)
hvIPIVector = POSTED_INTR_VECTOR;
#else
hvIPIVector = 0;
#endif
if (!apicIPILogged) {
Log("Monitor IPI vector: %x\n", monitorIPIVector);
Log("HV IPI vector: %x\n", hvIPIVector);
apicIPILogged = TRUE;
}
if ((__GET_MSR(MSR_APIC_BASE) & APIC_MSR_X2APIC_ENABLED) != 0) {
if (setVMPtr) {
vm->hostAPIC.base = NULL;
vm->vmhost->hostAPICIsMapped = FALSE;
vm->hostAPIC.isX2 = TRUE;
}
return TRUE;
}
if (probe && ProbeAPIC(vm, setVMPtr)) {
return TRUE;
}
/*
* Normal case: use Linux's pre-mapped APIC.
*/
kAddr = __fix_to_virt(FIX_APIC_BASE);
if (!isVAReadable(kAddr)) {
return TRUE;
}
if (setVMPtr) {
vm->hostAPIC.base = (void *)kAddr;
} else {
vm->hostAPIC.base = NULL;
}
#endif
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SemaphoreWait --
*
* Perform the semaphore wait (P) operation, possibly blocking.
*
* Result:
* 1 (which equals MX_WAITNORMAL) if success,
* negated error code otherwise.
*
* Side-effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SemaphoreWait(VMDriver *vm, // IN:
Vcpuid vcpuid, // IN:
uint64 *args) // IN:
{
struct file *file;
mm_segment_t old_fs;
int res;
int waitFD = args[0];
int timeoutms = args[2];
uint64 value;
file = vmware_fget(waitFD);
if (file == NULL) {
return MX_WAITERROR;
}
old_fs = get_fs();
set_fs(get_ds());
{
struct poll_wqueues table;
unsigned int mask;
poll_initwait(&table);
current->state = TASK_INTERRUPTIBLE;
mask = file->f_op->poll(file, &table.pt);
if (!(mask & (POLLIN | POLLERR | POLLHUP))) {
vm->vmhost->vcpuSemaTask[vcpuid] = current;
schedule_timeout(timeoutms * HZ / 1000); // convert ms to jiffies
vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
}
current->state = TASK_RUNNING;
poll_freewait(&table);
}
/*
* Userland only writes in multiples of sizeof(uint64). This will allow
* the code to happily deal with a pipe or an eventfd. We only care about
* reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64).
*/
res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos);
if (res == sizeof value) {
res = MX_WAITNORMAL;
} else {
if (res == 0) {
res = -EBADF;
}
}
set_fs(old_fs);
fput(file);
/*
* Handle benign errors:
* EAGAIN is MX_WAITTIMEDOUT.
* The signal-related errors are all mapped into MX_WAITINTERRUPTED.
*/
switch (res) {
case -EAGAIN:
res = MX_WAITTIMEDOUT;
break;
case -EINTR:
case -ERESTART:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
res = MX_WAITINTERRUPTED;
break;
case -EBADF:
res = MX_WAITERROR;
break;
}
return res;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SemaphoreForceWakeup --
*
* For each VCPU in the set whose target process is lightly sleeping (i.e.
* TASK_INTERRUPTIBLE), wake it up. The target process can be waiting on a
* semaphore or due to a call to Vmx86_YieldToSet.
*
* Result:
* None.
*
* Side-effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_SemaphoreForceWakeup(VMDriver *vm, // IN:
const VCPUSet *vcs) // IN:
{
FOR_EACH_VCPU_IN_SET(vcs, vcpuid) {
struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
if (t && (t->state & TASK_INTERRUPTIBLE)) {
wake_up_process(t);
}
} ROF_EACH_VCPU_IN_SET();
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SemaphoreSignal --
*
* Perform the semaphore signal (V) operation.
*
* Result:
* On success: MX_WAITNORMAL (1).
* On error: MX_WAITINTERRUPTED (3) if interrupted by a Unix signal (we
* can block on a preemptive kernel).
* MX_WAITERROR (0) on generic error.
* Negated system error (< 0).
*
* Side-effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SemaphoreSignal(uint64 *args) // IN:
{
struct file *file;
mm_segment_t old_fs;
int res;
int signalFD = args[1];
uint64 value = 1; // make an eventfd happy should it be there
file = vmware_fget(signalFD);
if (!file) {
return MX_WAITERROR;
}
old_fs = get_fs();
set_fs(get_ds());
/*
* Always write sizeof(uint64) bytes. This works fine for eventfd and
* pipes. The data written is formatted to make an eventfd happy should
* it be present.
*/
res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos);
if (res == sizeof value) {
res = MX_WAITNORMAL;
}
set_fs(old_fs);
fput(file);
/*
* Handle benign errors:
* EAGAIN is MX_WAITTIMEDOUT.
* The signal-related errors are all mapped into MX_WAITINTERRUPTED.
*/
switch (res) {
case -EAGAIN:
// The pipe is full, so it is already signalled. Success.
res = MX_WAITNORMAL;
break;
case -EINTR:
case -ERESTART:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
res = MX_WAITINTERRUPTED;
break;
}
return res;
}
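/*
* Illustrative sketch (not part of the driver): userland is expected to
* hand vmmon an eventfd or pipe fd and to transfer exactly sizeof(uint64)
* bytes, matching the read/write checks above. Roughly:
*
*   int fd = eventfd(0, EFD_NONBLOCK);  // hypothetical userland setup
*   uint64_t v = 1;
*   write(fd, &v, sizeof v);            // signal (V), as done here
*   read(fd, &v, sizeof v);             // wait (P), as in HostIF_SemaphoreWait
*/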
#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)) || !defined(CONFIG_SMP))
# define VMMON_USE_CALL_FUNC
#endif
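/*
* Rationale, as far as I can tell: per-CPU IPI targeting via
* arch_send_call_function_single_ipi() only exists with the generic SMP
* helpers introduced around 2.6.27, so older or non-SMP kernels fall back
* to a broadcast through smp_call_function().
*/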
#if defined(VMMON_USE_CALL_FUNC)
/*
*----------------------------------------------------------------------
*
* LinuxDriverIPIHandler --
*
* Null IPI handler - for monitor to notice AIO completion
*
*----------------------------------------------------------------------
*/
void
LinuxDriverIPIHandler(void *info)
{
return;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 17)
#define VMMON_CALL_FUNC_SYNC 0 // async; we've not seen any problems
#else
#define VMMON_CALL_FUNC_SYNC 1 // sync; ensure no problems from old releases
#endif
#endif
/*
*----------------------------------------------------------------------
*
* HostIF_IPI --
*
* If the passed VCPU threads are on some CPUs in the system,
* attempt to hit them with an IPI.
*
* On older Linux systems we do a broadcast.
*
* Result:
* The mode used to send IPIs.
*
*----------------------------------------------------------------------
*/
HostIFIPIMode
HostIF_IPI(VMDriver *vm, // IN:
const VCPUSet *ipiTargets) // IN:
{
HostIFIPIMode mode = IPI_NONE;
ASSERT(vm);
FOR_EACH_VCPU_IN_SET(ipiTargets, v) {
uint32 targetHostCpu = vm->currentHostCpu[v];
if (targetHostCpu != INVALID_PCPU) {
ASSERT(targetHostCpu < MAX_PCPUS);
#if defined(VMMON_USE_CALL_FUNC)
/* older kernels IPI broadcast; use async when possible */
(void) compat_smp_call_function(LinuxDriverIPIHandler,
NULL, VMMON_CALL_FUNC_SYNC);
mode = IPI_BROADCAST;
break;
#else
/* Newer kernels have (async) IPI targeting */
arch_send_call_function_single_ipi(targetHostCpu);
mode = IPI_UNICAST;
#endif
}
} ROF_EACH_VCPU_IN_SET();
return mode;
}
typedef struct {
Atomic_uint32 index;
CPUIDQuery *query;
} HostIFGetCpuInfoData;
/*
*-----------------------------------------------------------------------------
*
* HostIFGetCpuInfo --
*
* Collect CPUID information on the current logical CPU.
*
* Results:
* None.
*
* Side effects:
* 'data->index' is atomically incremented by one.
*
*-----------------------------------------------------------------------------
*/
static void
HostIFGetCpuInfo(void *clientData) // IN/OUT: A HostIFGetCpuInfoData *
{
HostIFGetCpuInfoData *data = (HostIFGetCpuInfoData *)clientData;
CPUIDQuery *query;
uint32 index;
ASSERT(data);
query = data->query;
ASSERT(query);
index = Atomic_ReadInc32(&data->index);
if (index >= query->numLogicalCPUs) {
return;
}
query->logicalCPUs[index].tag = HostIF_GetCurrentPCPU();
__GET_CPUID2(query->eax, query->ecx, &query->logicalCPUs[index].regs);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GetAllCpuInfo --
*
* Collect CPUID information on all logical CPUs.
*
* 'query->numLogicalCPUs' is the size of the 'query->logicalCPUs' output
* array.
*
* Results:
* On success: TRUE. 'query->logicalCPUs' is filled and
* 'query->numLogicalCPUs' is adjusted accordingly.
* On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIF_GetAllCpuInfo(CPUIDQuery *query) // IN/OUT
{
HostIFGetCpuInfoData data;
Atomic_Write32(&data.index, 0);
data.query = query;
/*
* XXX Linux has userland APIs to bind a thread to a processor, so we could
* probably implement this in userland like we do on Win32.
*/
HostIF_CallOnEachCPU(HostIFGetCpuInfo, &data);
/*
* At this point, Atomic_Read32(&data.index) is the number of logical CPUs
* who replied.
*/
if (Atomic_Read32(&data.index) > query->numLogicalCPUs) {
return FALSE;
}
ASSERT(Atomic_Read32(&data.index) <= query->numLogicalCPUs);
query->numLogicalCPUs = Atomic_Read32(&data.index);
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* HostIF_CallOnEachCPU --
*
* Call specified function once on each CPU. No ordering guarantees.
*
* Results:
* None.
*
* Side effects:
* None. May be slow.
*
*----------------------------------------------------------------------
*/
void
HostIF_CallOnEachCPU(void (*func)(void*), // IN: function to call
void *data) // IN/OUT: argument to function
{
preempt_disable();
(*func)(data);
(void)compat_smp_call_function(*func, data, 1);
preempt_enable();
}
/*
*-----------------------------------------------------------------------------
*
* HostIFCheckTrackedMPN --
*
* Check if a given MPN is tracked for the specified VM.
*
* Result:
* TRUE if the MPN is tracked in one of the trackers for the specified VM,
* FALSE otherwise.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIFCheckTrackedMPN(VMDriver *vm, // IN: The VM instance
MPN mpn) // IN: The MPN
{
VMHost * const vmh = vm->vmhost;
if (vmh == NULL) {
return FALSE;
}
HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock.
if (vmh->lockedPages) {
if (PhysTrack_Test(vmh->lockedPages, mpn)) {
HostIF_VMUnlock(vm, 32);
return TRUE;
}
}
if (vmh->AWEPages) {
if (PhysTrack_Test(vmh->AWEPages, mpn)) {
HostIF_VMUnlock(vm, 32);
return TRUE;
}
}
if (vm->memtracker) {
if (MemTrack_LookupMPN(vm->memtracker, mpn) != NULL) {
HostIF_VMUnlock(vm, 32);
return TRUE;
}
}
HostIF_VMUnlock(vm, 32);
if (vmx86_debug) {
/*
* The monitor may have old KSeg mappings to pages which it no longer
* owns. Minimize customer noise by only logging this for developers.
*/
Log("%s: MPN %" FMT64 "x not owned by this VM\n", __FUNCTION__, mpn);
}
return FALSE;
}
/*
*----------------------------------------------------------------------
*
* HostIF_ReadPage --
*
* Reads one page of data from a machine page and returns it in the
* specified kernel or user buffer. The machine page must be owned by
* the specified VM.
*
* Results:
* 0 on success
* negative error code on error
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
int
HostIF_ReadPage(VMDriver *vm, // IN: The VM instance
MPN mpn, // MPN of the page
VA64 addr, // buffer for data
Bool kernelBuffer) // is the buffer in kernel space?
{
void *buf = VA64ToPtr(addr);
int ret = 0;
const void* ptr;
struct page* page;
if (mpn == INVALID_MPN) {
return -EFAULT;
}
if (HostIFCheckTrackedMPN(vm, mpn) == FALSE) {
return -EFAULT;
}
page = pfn_to_page(mpn);
ptr = kmap(page);
if (ptr == NULL) {
return -ENOMEM;
}
if (kernelBuffer) {
memcpy(buf, ptr, PAGE_SIZE);
} else {
ret = HostIF_CopyToUser(buf, ptr, PAGE_SIZE);
}
kunmap(page);
return ret;
}
/*
*----------------------------------------------------------------------
*
* HostIF_WritePage --
*
* Writes one page of data from a kernel or user buffer onto the specified
* machine page. The machine page must be owned by the specified VM.
*
* Results:
* 0 on success
* negative error code on error
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
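/*
* HostIFWritePageWork is the shared helper: it performs the copy without
* the MPN-ownership check, which HostIF_WritePage adds and
* HostIF_WriteMachinePage deliberately skips (host-global pages are not
* tracked by any VM).
*/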
int
HostIFWritePageWork(MPN mpn, // MPN of the page
VA64 addr, // data to write to the page
Bool kernelBuffer) // is the buffer in kernel space?
{
void const *buf = VA64ToPtr(addr);
int ret = 0;
void* ptr;
struct page* page;
if (mpn == INVALID_MPN) {
return -EFAULT;
}
page = pfn_to_page(mpn);
ptr = kmap(page);
if (ptr == NULL) {
return -ENOMEM;
}
if (kernelBuffer) {
memcpy(ptr, buf, PAGE_SIZE);
} else {
ret = HostIF_CopyFromUser(ptr, buf, PAGE_SIZE);
}
kunmap(page);
return ret;
}
int
HostIF_WritePage(VMDriver *vm, // IN: The VM instance
MPN mpn, // MPN of the page
VA64 addr, // data to write to the page
Bool kernelBuffer) // is the buffer in kernel space?
{
if (HostIFCheckTrackedMPN(vm, mpn) == FALSE) {
return -EFAULT;
}
return HostIFWritePageWork(mpn, addr, kernelBuffer);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_WriteMachinePage --
*
* Puts the content of a machine page into a kernel or user mode
* buffer. This should only be used for host-global pages, not any
* VM-owned pages.
*
* Results:
* On success: 0
* On failure: a negative error code
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_WriteMachinePage(MPN mpn, // IN: MPN of the page
VA64 addr) // IN: data to write to the page
{
return HostIFWritePageWork(mpn, addr, TRUE);
}
/*
*----------------------------------------------------------------------
*
* HostIF_GetLockedPageList --
*
* Puts the MPNs of pages that were allocated by HostIF_AllocLockedPages()
* into a user-mode buffer.
*
* Results:
* non-negative number of the MPNs in the buffer on success.
* negative error code on error (-EFAULT)
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
int
HostIF_GetLockedPageList(VMDriver* vm, // IN: VM instance pointer
VA64 uAddr, // OUT: user mode buffer for MPNs
unsigned int numPages) // IN: size of the buffer in MPNs
{
MPN *mpns = VA64ToPtr(uAddr);
MPN mpn;
unsigned count;
struct PhysTracker* AWEPages;
if (!vm->vmhost || !vm->vmhost->AWEPages) {
return 0;
}
AWEPages = vm->vmhost->AWEPages;
for (mpn = 0, count = 0;
(count < numPages) &&
(INVALID_MPN != (mpn = PhysTrack_GetNext(AWEPages, mpn)));
count++) {
if (HostIF_CopyToUser(&mpns[count], &mpn, sizeof *mpns) != 0) {
return -EFAULT;
}
}
return count;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GetNextAnonPage --
*
* If "inMPN" is INVALID_MPN, gets the first MPN in the anon MPN list;
* otherwise gets the anon MPN after "inMPN" in that list.
*
* Results:
* Next anon MPN. If the list has been exhausted, returns INVALID_MPN.
*
*-----------------------------------------------------------------------------
*/
MPN
HostIF_GetNextAnonPage(VMDriver *vm, MPN inMPN)
{
if (!vm->vmhost || !vm->vmhost->AWEPages) {
return INVALID_MPN;
}
return PhysTrack_GetNext(vm->vmhost->AWEPages, inMPN);
}
/*
*----------------------------------------------------------------------
*
* HostIF_GetCurrentPCPU --
*
* Get current physical CPU id. Interrupts should be disabled so
* that the thread cannot move to another CPU.
*
* Results:
* Host CPU number.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
uint32
HostIF_GetCurrentPCPU(void)
{
return smp_processor_id();
}
#ifdef VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
/*
*----------------------------------------------------------------------
*
* HostIFWakeupClockThread --
*
* Wake up the fast clock thread. Can't do this from the timer
* callback, because it holds locks that the scheduling code
* might take.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static void
HostIFWakeupClockThread(unsigned long data) //IN:
{
wake_up_process(linuxState.fastClockThread);
}
/*
*----------------------------------------------------------------------
*
* HostIFTimerCallback --
*
* Schedule a tasklet to wake up the fast clock thread.
*
* Results:
* Tell the kernel not to restart the timer.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static enum hrtimer_restart
HostIFTimerCallback(struct hrtimer *timer) //IN:
{
tasklet_schedule(&timerTasklet);
return HRTIMER_NORESTART;
}
/*
*----------------------------------------------------------------------
*
* HostIFScheduleHRTimeout --
*
* Schedule an hrtimer to wake up the fast clock thread.
*
* Results:
* None.
*
* Side effects:
* Sleep.
*
*----------------------------------------------------------------------
*/
static void
HostIFScheduleHRTimeout(ktime_t *expires) //IN:
{
struct hrtimer t;
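   /*
    * Open-coded equivalent of schedule_hrtimeout() for kernels that lack
    * it: arm a temporary hrtimer on the stack, sleep in schedule() while
    * it is pending, and let its callback (via the tasklet above) wake
    * this thread.
    */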
if (expires && !expires->tv64) {
__set_current_state(TASK_RUNNING);
return;
}
hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
t.function = HostIFTimerCallback;
hrtimer_start(&t, *expires, HRTIMER_MODE_REL);
if (hrtimer_active(&t)) {
schedule();
}
hrtimer_cancel(&t);
__set_current_state(TASK_RUNNING);
}
#endif //VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
#ifndef VMMON_USE_HIGH_RES_TIMERS
/*
*----------------------------------------------------------------------
*
* HostIFDoIoctl --
*
 * Issue an ioctl. Assume the kernel is not locked. That is not true
 * today, but it keeps things easier to understand and won't surprise
 * us later when we get rid of the kernel lock in our code.
*
* Results:
* Same as ioctl method.
*
* Side effects:
* none.
*
*----------------------------------------------------------------------
*/
static long
HostIFDoIoctl(struct file *filp,
u_int iocmd,
unsigned long ioarg)
{
if (filp->f_op->unlocked_ioctl) {
return filp->f_op->unlocked_ioctl(filp, iocmd, ioarg);
}
return -ENOIOCTLCMD;
}
#endif //VMMON_USE_HIGH_RES_TIMERS
/*
*----------------------------------------------------------------------
*
* HostIFStartTimer --
*
* Starts the timer using either /dev/rtc or high-resolution timers.
*
* Results:
* Returns 0 on success, -1 on failure.
*
* Side effects:
* Sleep until timer expires.
*
*----------------------------------------------------------------------
*/
int
HostIFStartTimer(Bool rateChanged, //IN: Did rate change?
unsigned int rate, //IN: current clock rate
struct file *filp) //IN: /dev/rtc descriptor
{
#ifdef VMMON_USE_HIGH_RES_TIMERS
static unsigned long slack = 0;
static ktime_t expires;
int timerPeriod;
if (rateChanged) {
timerPeriod = NSEC_PER_SEC / rate;
expires = ktime_set(0, timerPeriod);
/*
* Allow the kernel to expire the timer at its convenience.
* ppoll() uses 0.1% of the timeout value. I think we can
* tolerate 1%.
*/
slack = timerPeriod / 100;
}
set_current_state(TASK_INTERRUPTIBLE);
# ifdef VMMON_USE_SCHEDULE_HRTIMEOUT
schedule_hrtimeout_range(&expires, slack, HRTIMER_MODE_REL);
# else
HostIFScheduleHRTimeout(&expires);
# endif
#else
unsigned p2rate;
int res;
unsigned long buf;
loff_t pos = 0;
if (rateChanged) {
/*
* The host will already have HZ timer interrupts per second. So
* in order to satisfy the requested rate, we need up to (rate -
* HZ) additional interrupts generated by the RTC. That way, if
 * the guest asks for a bit more than 1024 virtual interrupts per
* second (which is a common case for Windows with multimedia
* timers), we'll program the RTC to 1024 rather than 2048, which
* saves a considerable amount of CPU. PR 519228.
*/
if (rate > HZ) {
rate -= HZ;
} else {
rate = 0;
}
/*
* Don't set the RTC rate to 64 Hz or lower: some kernels have a
* bug in the HPET emulation of RTC that will cause the RTC
* frequency to get stuck at 64Hz. See PR 519228 comment #23.
*/
p2rate = 128;
// Hardware rate must be a power of 2
while (p2rate < rate && p2rate < 8192) {
p2rate <<= 1;
}
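      /*
       * Example: a requested rate of 1000 Hz walks 128 -> 256 -> 512 ->
       * 1024, so the RTC is programmed at 1024 Hz; 8192 Hz is the RTC
       * hardware maximum.
       */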
res = HostIFDoIoctl(filp, RTC_IRQP_SET, p2rate);
if (res < 0) {
Warning("/dev/rtc set rate %d failed: %d\n", p2rate, res);
return -1;
}
if (kthread_should_stop()) {
return -1;
}
}
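   /*
    * Block until the next RTC periodic interrupt; this blocking read is
    * what provides the fine-grained sleep.
    */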
res = filp->f_op->read(filp, (void *) &buf, sizeof(buf), &pos);
if (res <= 0) {
if (res != -ERESTARTSYS) {
Log("/dev/rtc read failed: %d\n", res);
}
return -1;
}
#endif
return 0;
}
/*
*----------------------------------------------------------------------
*
* HostIFFastClockThread --
*
* Kernel thread that provides finer-grained wakeups than the
* main system timers by using /dev/rtc. We can't do this at
* user level because /dev/rtc is not sharable (PR 19266). Also,
* we want to avoid the overhead of a context switch out to user
* level on every RTC interrupt.
*
* Results:
* Returns 0.
*
* Side effects:
* Wakeups and IPIs.
*
*----------------------------------------------------------------------
*/
static int
HostIFFastClockThread(void *data) // IN:
{
struct file *filp = (struct file *) data;
int res;
mm_segment_t oldFS;
unsigned int rate = 0;
unsigned int prevRate = 0;
oldFS = get_fs();
set_fs(KERNEL_DS);
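   /*
    * We just widened the address limit to kernel space so that the
    * blocking /dev/rtc read in HostIFStartTimer() can use an on-stack
    * kernel buffer; the old limit is restored on the way out.
    */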
allow_signal(SIGKILL);
set_user_nice(current, linuxState.fastClockPriority);
while ((rate = linuxState.fastClockRate) > MIN_RATE) {
if (kthread_should_stop()) {
goto out;
}
res = HostIFStartTimer(rate != prevRate, rate, filp);
if (res < 0) {
goto out;
}
prevRate = rate;
#if defined(CONFIG_SMP)
/*
* IPI each VCPU thread that is in the monitor and is due to
* fire a MonTimer callback.
*/
Vmx86_MonTimerIPI();
#endif
/*
* Wake threads that are waiting for a fast poll timeout at
* userlevel. This is needed only on Linux. On Windows,
* we get shorter timeouts simply by increasing the host
* clock rate.
*/
LinuxDriverWakeUp(TRUE);
}
out:
LinuxDriverWakeUp(TRUE);
set_fs(oldFS);
/*
* Do not exit thread until we are told to do so.
*/
do {
set_current_state(TASK_UNINTERRUPTIBLE);
if (kthread_should_stop()) {
break;
}
schedule();
} while (1);
set_current_state(TASK_RUNNING);
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SetFastClockRate --
*
* The monitor wants to poll for events at the given rate.
* Ensure that the host OS's timer interrupts come at least at
* this rate. If the requested rate is greater than the rate at
* which timer interrupts will occur on CPUs other than 0, then
* also arrange to call Vmx86_MonitorPollIPI on every timer
* interrupt, in order to relay IPIs to any other CPUs that need
* them.
*
* Locking:
* The caller must hold the fast clock lock.
*
* Results:
* 0 for success; positive error code if /dev/rtc could not be opened.
*
* Side effects:
* As described above.
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SetFastClockRate(unsigned int rate) // IN: Frequency in Hz.
{
ASSERT(MutexIsLocked(&fastClockMutex));
linuxState.fastClockRate = rate;
/*
* Overview
* --------
* An SMP Linux kernel programs the 8253 timer (to increment the 'jiffies'
* counter) _and_ all local APICs (to run the scheduler code) to deliver
* interrupts HZ times a second.
*
* Time
* ----
* The kernel tries very hard to spread all these interrupts evenly over
* time, i.e. on a 1 CPU system, the 1 local APIC phase is shifted by 1/2
* period compared to the 8253, and on a 2 CPU system, the 2 local APIC
* phases are respectively shifted by 1/3 and 2/3 period compared to the
* 8253. This is done to reduce contention on locks guarding the global task
* queue.
*
* Space
* -----
* The 8253 interrupts are distributed between physical CPUs, evenly on a P3
* system, whereas on a P4 system physical CPU 0 gets all of them.
*
* Long story short, unless the monitor requested rate is significantly
* higher than HZ, we don't need to send IPIs or exclusively grab /dev/rtc
* to periodically kick vCPU threads running in the monitor on all physical
* CPUs.
*/
if (rate > MIN_RATE) {
if (!linuxState.fastClockThread) {
struct task_struct *rtcTask;
struct file *filp = NULL;
#if !defined(VMMON_USE_HIGH_RES_TIMERS)
int res;
filp = filp_open("/dev/rtc", O_RDONLY, 0);
if (IS_ERR(filp)) {
Warning("/dev/rtc open failed: %d\n", (int)(VA)filp);
return -(int)(VA)filp;
}
res = HostIFDoIoctl(filp, RTC_PIE_ON, 0);
if (res < 0) {
Warning("/dev/rtc enable interrupt failed: %d\n", res);
filp_close(filp, current->files);
return -res;
}
#endif
rtcTask = kthread_run(HostIFFastClockThread, filp, "vmware-rtc");
if (IS_ERR(rtcTask)) {
long err = PTR_ERR(rtcTask);
/*
* Ignore ERESTARTNOINTR silently, it occurs when signal is
* pending, and syscall layer automatically reissues operation
* after signal is handled.
*/
if (err != -ERESTARTNOINTR) {
Warning("/dev/rtc cannot start watch thread: %ld\n", err);
}
close_rtc(filp, current->files);
return -err;
}
linuxState.fastClockThread = rtcTask;
linuxState.fastClockFile = filp;
}
} else {
if (linuxState.fastClockThread) {
force_sig(SIGKILL, linuxState.fastClockThread);
kthread_stop(linuxState.fastClockThread);
close_rtc(linuxState.fastClockFile, current->files);
linuxState.fastClockThread = NULL;
linuxState.fastClockFile = NULL;
}
}
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_MapUserMem --
*
* Obtain kernel pointer to user memory. The pages backing the user memory
* address are locked into memory (this allows the pointer to be used in
* contexts where paging is undesirable or impossible).
*
* Results:
* On success, returns the kernel virtual address, along with a handle to
* be used for unmapping.
* On failure, returns NULL.
*
* Side effects:
* Yes.
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_MapUserMem(VA addr, // IN: User memory virtual address
size_t size, // IN: Size of memory desired
VMMappedUserMem **handle) // OUT: Handle to mapped memory
{
void *p = (void *) (uintptr_t) addr;
VMMappedUserMem *newHandle;
VA offset = addr & (PAGE_SIZE - 1);
size_t numPagesNeeded = ((offset + size) / PAGE_SIZE) + 1;
size_t handleSize =
sizeof *newHandle + numPagesNeeded * sizeof newHandle->pages[0];
void *mappedAddr;
ASSERT(handle);
if (!access_ok(VERIFY_WRITE, p, size)) {
printk(KERN_ERR "%s: Couldn't verify write to uva 0x%p with size %"
FMTSZ"u\n", __func__, p, size);
return NULL;
}
newHandle = kmalloc(handleSize, GFP_KERNEL);
if (newHandle == NULL) {
printk(KERN_ERR "%s: Couldn't allocate %"FMTSZ"u bytes of memory\n",
__func__, handleSize);
return NULL;
}
if (HostIFGetUserPages(p, newHandle->pages, numPagesNeeded)) {
kfree(newHandle);
printk(KERN_ERR "%s: Couldn't get %"FMTSZ"u %s for uva 0x%p\n", __func__,
numPagesNeeded, numPagesNeeded > 1 ? "pages" : "page", p);
return NULL;
}
if (numPagesNeeded > 1) {
/*
* Unlike kmap(), vmap() can fail. If it does, we need to release the
* pages that we acquired in HostIFGetUserPages().
*/
mappedAddr = vmap(newHandle->pages, numPagesNeeded, VM_MAP, PAGE_KERNEL);
if (mappedAddr == NULL) {
unsigned int i;
for (i = 0; i < numPagesNeeded; i++) {
put_page(newHandle->pages[i]);
}
kfree(newHandle);
printk(KERN_ERR "%s: Couldn't vmap %"FMTSZ"u %s for uva 0x%p\n",
__func__, numPagesNeeded,
numPagesNeeded > 1 ? "pages" : "page", p);
return NULL;
}
} else {
mappedAddr = kmap(newHandle->pages[0]);
}
printk(KERN_DEBUG "%s: p = 0x%p, offset = 0x%p, numPagesNeeded = %"FMTSZ"u,"
" handleSize = %"FMTSZ"u, mappedAddr = 0x%p\n",
__func__, p, (void *)offset, numPagesNeeded, handleSize, mappedAddr);
newHandle->numPages = numPagesNeeded;
newHandle->addr = mappedAddr;
*handle = newHandle;
return mappedAddr + offset;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_UnmapUserMem --
*
* Unmap user memory from HostIF_MapUserMem().
*
* Results:
* None.
*
* Side effects:
* Yes.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_UnmapUserMem(VMMappedUserMem *handle) // IN: Handle to mapped memory
{
unsigned int i;
if (handle == NULL) {
return;
}
printk(KERN_DEBUG "%s: numPages = %"FMTSZ"u, addr = 0x%p\n",
__func__, handle->numPages, handle->addr);
if (handle->numPages > 1) {
vunmap(handle->addr);
} else {
kunmap(handle->pages[0]);
}
for (i = 0; i < handle->numPages; i++) {
put_page(handle->pages[i]);
}
kfree(handle);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SafeRDMSR --
*
* Attempt to read a MSR, and handle the exception if the MSR
* is unimplemented.
*
* Results:
* 0 if successful, and MSR value is returned via *val.
*
* If the MSR is unimplemented, *val is set to 0, and a
* non-zero value is returned: -1 for Win32, -EFAULT for Linux,
* and 1 for MacOS.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SafeRDMSR(unsigned int msr, // IN
uint64 *val) // OUT: MSR value
{
int ret;
unsigned low, high;
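   /*
    * The rdmsr at label 2 is covered by an exception-table entry: if the
    * MSR is unimplemented, the resulting fault resumes execution at label
    * 3 in the .fixup section, which loads -EFAULT into ret and jumps back
    * to label 1.
    */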
asm volatile("2: rdmsr ; xor %0,%0\n"
"1:\n\t"
".section .fixup,\"ax\"\n\t"
"3: mov %4,%0 ; jmp 1b\n\t"
".previous\n\t"
VMW_ASM_EXTABLE(2b, 3b)
: "=r"(ret), "=a"(low), "=d"(high)
: "c"(msr), "i"(-EFAULT), "1"(0), "2"(0)); // init eax/edx to 0
*val = (low | ((u64)(high) << 32));
return ret;
}
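Before moving on to vmnet, it doesn't hurt to check right away that vmmon now compiles against the new kernel (same command as in the final step below):
# cd /usr/lib/vmware/modules/source/vmmon-only ; make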
Next, modify the file /usr/lib/vmware/modules/source/vmnet-only/bridge.c so that you end up with the following:
/*********************************************************
* Copyright (C) 1998-2013, 2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
#include "driver-config.h"
#define EXPORT_SYMTAB
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/mm.h>
#include "compat_skbuff.h"
#include <linux/sockios.h>
#include <linux/spinlock.h>
#include "compat_sock.h"
#define __KERNEL_SYSCALLS__
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/file.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#ifdef CONFIG_NET_RADIO
# include <linux/wireless.h>
#endif
#include "vmnetInt.h"
#include "compat_netdevice.h"
#include "vnetInt.h"
#include "smac.h"
#define VNET_BRIDGE_HISTORY 48
/*
* Bytes reserved before start of packet. As Ethernet header has 14 bytes,
* to get aligned IP header we must skip 2 bytes before packet. Not that it
* matters a lot for us, but using 2 is compatible with what newer 2.6.x
* kernels do.
*/
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN 2
#endif
#if LOGLEVEL >= 4
static struct timeval vnetTime;
#endif
typedef struct VNetBridge VNetBridge;
struct VNetBridge {
struct notifier_block notifier; // for device state changes
char name[VNET_NAME_LEN]; // name of net device (e.g., "eth0")
struct net_device *dev; // device structure for 'name'
struct sock *sk; // socket associated with skb's
struct packet_type pt; // used to add packet handler
Bool enabledPromisc; // track if promisc enabled
Bool warnPromisc; // tracks if warning has been logged
Bool forceSmac; // whether to use smac unconditionally
struct sk_buff *history[VNET_BRIDGE_HISTORY]; // avoid duplicate packets
spinlock_t historyLock; // protects 'history'
VNetPort port; // connection to virtual hub
Bool wirelessAdapter; // connected to wireless adapter?
struct SMACState *smac; // device structure for wireless
VNetEvent_Sender *eventSender; // event sender
};
typedef PacketStatus (* SMACINT SMACFunc)(struct SMACState *, SMACPackets *);
static int VNetBridgeUp(VNetBridge *bridge, Bool rtnlLock);
static void VNetBridgeDown(VNetBridge *bridge, Bool rtnlLock);
static int VNetBridgeNotify(struct notifier_block *this, u_long msg,
void *data);
static int VNetBridgeReceiveFromDev(struct sk_buff *skb,
struct net_device *dev,
struct packet_type *pt,
struct net_device *real_dev);
static void VNetBridgeFree(VNetJack *this);
static void VNetBridgeReceiveFromVNet(VNetJack *this, struct sk_buff *skb);
static Bool VNetBridgeCycleDetect(VNetJack *this, int generation);
static Bool VNetBridgeIsDeviceWireless(struct net_device *dev);
static void VNetBridgePortsChanged(VNetJack *this);
static int VNetBridgeIsBridged(VNetJack *this);
static int VNetBridgeProcRead(char *page, char **start, off_t off,
int count, int *eof, void *data);
static void VNetBridgeComputeHeaderPosIPv6(struct sk_buff *skb);
static PacketStatus VNetCallSMACFunc(struct SMACState *state,
struct sk_buff **skb, void *startOfData,
SMACFunc func, unsigned int len);
/*
*----------------------------------------------------------------------
*
* VNetBridgeStartPromisc --
*
* Set IFF_PROMISC on the peer interface.
*
* Results:
* None.
*
* Side effects:
* The peer interface IFF_PROMISC flag may be changed.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeStartPromisc(VNetBridge *bridge, // IN:
Bool rtnlLock) // IN: Acquire RTNL lock
{
struct net_device *dev = bridge->dev;
/*
    * Don't put wireless cards into promiscuous mode: even those cards
    * which do support RF monitoring would not be able to function
    * correctly, i.e. they would not be able to send data packets.
*/
if (rtnlLock) {
rtnl_lock();
}
if (!bridge->enabledPromisc && !bridge->wirelessAdapter) {
dev_set_promiscuity(dev, 1);
bridge->enabledPromisc = TRUE;
bridge->warnPromisc = FALSE;
LOG(0, (KERN_NOTICE "bridge-%s: enabled promiscuous mode\n",
bridge->name));
}
if (rtnlLock) {
rtnl_unlock();
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeStopPromisc --
*
* Restore saved IFF_PROMISC on the peer interface.
*
* Results:
* None.
*
* Side effects:
* The peer interface IFF_PROMISC flag may be changed.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeStopPromisc(VNetBridge *bridge, // IN:
Bool rtnlLock) // IN: Acquire RTNL lock
{
struct net_device *dev = bridge->dev;
if (rtnlLock) {
rtnl_lock();
}
if (bridge->enabledPromisc && !bridge->wirelessAdapter) {
dev_set_promiscuity(dev, -1);
bridge->enabledPromisc = FALSE;
LOG(0, (KERN_NOTICE "bridge-%s: disabled promiscuous mode\n",
bridge->name));
}
if (rtnlLock) {
rtnl_unlock();
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeDevCompatible --
*
* Check whether bridge and network device are compatible.
*
* Results:
* Non-zero if device is good enough for bridge. Zero otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER int
VNetBridgeDevCompatible(VNetBridge *bridge, // IN: Bridge
struct net_device *net) // IN: Network device
{
#ifdef VMW_NETDEV_HAS_NET
if (compat_dev_net(net) != &init_net) {
return 0;
}
#endif
return strcmp(net->name, bridge->name) == 0;
}
/*
*----------------------------------------------------------------------
*
* VNetBridge_Create --
*
* Creates a bridge. Allocates struct, allocates internal device,
* initializes port/jack, and creates a proc entry. Finally, creates an
 * event sender and registers itself with the kernel for device state
* change notifications.
*
* At this time the bridge is not yet plugged into the hub, because this
* will be done by the caller, i.e. the driver. But we need to know the
* hub in order to create an event sender. This allows for enabling
* the notification mechanism, which will instantly start firing, which in
* turn will bring up the bridge (if present), which eventually will
* inject bridge events. Moreover, the bridge will start injecting
* packets, which will be dropped on the floor. All in all, this is not
* that elegant. Alternatively, we could (i) plug into the hub inside of
* this function, which would require adding a few parameters, (ii) split
* the function into a create part and a registration part. Both ways are
* not consistent with how driver.c plugs the ports into the hub.
*
* Results:
* Errno. Also returns an allocated jack to connect to,
* NULL on error.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
VNetBridge_Create(const char *devName, // IN: name of device (e.g., "eth0")
uint32 flags, // IN: configuration flags
VNetJack *hubJack, // IN: the future hub
VNetPort **ret) // OUT: port to virtual hub
{
VNetBridge *bridge = NULL;
static unsigned id = 0;
int retval = 0;
*ret = NULL;
/*
 * It's an error if the device name is empty.
*/
if (devName[0] == '\0') {
retval = -EINVAL;
goto out;
}
/* complain about unknown/unsupported flags */
if (flags & ~VNET_BRFLAG_FORCE_SMAC) {
retval = -EINVAL;
goto out;
}
/*
* Allocate bridge structure
*/
bridge = kmalloc(sizeof *bridge, GFP_USER);
if (bridge == NULL) {
retval = -ENOMEM;
goto out;
}
memset(bridge, 0, sizeof *bridge);
spin_lock_init(&bridge->historyLock);
memcpy(bridge->name, devName, sizeof bridge->name);
NULL_TERMINATE_STRING(bridge->name);
/*
* Initialize jack.
*/
bridge->port.id = id++;
bridge->port.next = NULL;
bridge->port.jack.peer = NULL;
bridge->port.jack.numPorts = 1;
VNetSnprintf(bridge->port.jack.name, sizeof bridge->port.jack.name,
"bridge%u", bridge->port.id);
bridge->port.jack.private = bridge;
bridge->port.jack.index = 0;
bridge->port.jack.procEntry = NULL;
bridge->port.jack.free = VNetBridgeFree;
bridge->port.jack.rcv = VNetBridgeReceiveFromVNet;
bridge->port.jack.cycleDetect = VNetBridgeCycleDetect;
bridge->port.jack.portsChanged = VNetBridgePortsChanged;
bridge->port.jack.isBridged = VNetBridgeIsBridged;
/*
* Make proc entry for this jack.
*/
retval = VNetProc_MakeEntry(bridge->port.jack.name, S_IFREG, bridge,
VNetBridgeProcRead,
&bridge->port.jack.procEntry);
if (retval) {
if (retval == -ENXIO) {
bridge->port.jack.procEntry = NULL;
} else {
goto out;
}
}
/*
* Rest of fields.
*/
bridge->port.flags = IFF_RUNNING;
memset(bridge->port.paddr, 0, sizeof bridge->port.paddr);
memset(bridge->port.ladrf, 0, sizeof bridge->port.ladrf);
bridge->port.paddr[0] = VMX86_STATIC_OUI0;
bridge->port.paddr[1] = VMX86_STATIC_OUI1;
bridge->port.paddr[2] = VMX86_STATIC_OUI2;
bridge->port.fileOpRead = NULL;
bridge->port.fileOpWrite = NULL;
bridge->port.fileOpIoctl = NULL;
bridge->port.fileOpPoll = NULL;
/* misc. configuration */
bridge->forceSmac = (flags & VNET_BRFLAG_FORCE_SMAC) ? TRUE : FALSE;
/* create event sender */
retval = VNetHub_CreateSender(hubJack, &bridge->eventSender);
if (retval != 0) {
goto out;
}
/*
* on RHEL3 Linux 2.4.21-47 (others maybe too) the notifier does not fire
* and bring up the bridge as expected, thus we bring it up manually
* *before* registering the notifier (PR306435)
*/
VNetBridgeUp(bridge, TRUE);
/*
* register notifier for network device state change notifications, the
* notifier will fire right away, and the notifier handler will bring up
* the bridge (see exception above)
*/
bridge->notifier.notifier_call = VNetBridgeNotify;
bridge->notifier.priority = 0;
register_netdevice_notifier(&bridge->notifier);
/* return bridge */
*ret = &bridge->port;
LOG(1, (KERN_DEBUG "bridge-%s: attached\n", bridge->name));
return 0;
out:
if (bridge != NULL) {
kfree(bridge);
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeFree --
*
* Unregister from device state notifications, disable the bridge,
* destroy sender, remove proc entry, cleanup smac, and deallocate
* struct.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
VNetBridgeFree(VNetJack *this) // IN: jack to free
{
VNetBridge *bridge = (VNetBridge*)this->private;
/* unregister notifier */
if (bridge->notifier.notifier_call != NULL) {
int err;
err = compat_unregister_netdevice_notifier(&bridge->notifier);
if (err != 0) {
LOG(0, (KERN_NOTICE "Can't unregister netdevice notifier (%d)\n",
err));
}
bridge->notifier.notifier_call = NULL;
}
/* disable bridge */
if (bridge->dev != NULL) {
LOG(1, (KERN_DEBUG "bridge-%s: disabling the bridge\n", bridge->name));
VNetBridgeDown(bridge, TRUE);
}
/* destroy event sender */
VNetEvent_DestroySender(bridge->eventSender);
bridge->eventSender = NULL;
/* remove /proc entry */
if (this->procEntry) {
VNetProc_RemoveEntry(this->procEntry);
}
if (bridge->smac){
SMAC_CleanupState(&(bridge->smac));
}
/* free bridge */
LOG(1, (KERN_DEBUG "bridge-%s: detached\n", bridge->name));
kfree(bridge);
}
/*
*----------------------------------------------------------------------
*
* VNetCallSMACFunc --
*
* Wrapper for SMAC functions. The skb must be linear.
*
* Results:
* Packet Status.
*
* Side effects:
* The skb buffer is freed if not successful otherwise it points to
* the clone.
*
*----------------------------------------------------------------------
*/
static PacketStatus
VNetCallSMACFunc(struct SMACState *state, // IN: pointer to state
struct sk_buff **skb, // IN/OUT: packet to process
void *startOfData, // IN: points to start of data
SMACFunc func, // IN: function to be called
unsigned int len) // IN: length including ETH header
{
SMACPackets packets = { {0} };
PacketStatus status;
SKB_LINEAR_ASSERT(*skb);
packets.orig.skb = *skb;
packets.orig.startOfData = startOfData;
packets.orig.len = len;
status = func(state, &packets);
if (status != PacketStatusForwardPacket) {
dev_kfree_skb(*skb);
return status;
}
if (packets.clone.skb) {
dev_kfree_skb(*skb);
*skb = packets.clone.skb;
}
return status;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeReceiveFromVNet --
*
* This jack is receiving a packet from a vnet. This function
* sends down (i.e., out on the host net device) if the packet
* isn't destined for the host, and it sends up (i.e.,
* simulates a receive for the host) if the packet
* satisfies the host's packet filter.
*
* When the function sends up it keeps a reference to the
* packet in a history list so that we can avoid handing
* a VM a copy of its own packet.
*
* Results:
* None.
*
* Side effects:
* Frees skb. Checks if host device is still using
* promiscuous mode.
*
*----------------------------------------------------------------------
*/
void
VNetBridgeReceiveFromVNet(VNetJack *this, // IN: jack
struct sk_buff *skb) // IN: pkt to receive
{
VNetBridge *bridge = (VNetBridge*)this->private;
struct net_device *dev = bridge->dev;
uint8 dest[ETH_ALEN];
struct sk_buff *clone;
LOG(3, (KERN_DEBUG "bridge-%s: transmit %d\n",
bridge->name, (int) skb->len));
if (!dev) {
dev_kfree_skb(skb);
return;
}
/*
* skb might be freed by wireless code, so need to keep
* a local copy of the MAC rather than a pointer to it.
*/
memcpy(dest, SKB_2_DESTMAC(skb), ETH_ALEN);
#ifdef notdef
// xxx;
/*
* We need to send the packet both up to the host and down
* to the interface.
* However, we ignore packets destined only for this hub.
*/
for (i = 0; i < VNET_PORTS_PER_HUB; i++) {
VNetPort *p = &port->hub->port[i];
if (UP_AND_RUNNING(p->flags) && MAC_EQ(dest, p->paddr)) {
return;
}
}
#endif
/*
    * SMAC processing. SMAC interfaces assume that the skb is linear, so
    * ensure that this is the case prior to calling out.
*/
if (bridge->smac) {
if (compat_skb_is_nonlinear(skb) && compat_skb_linearize(skb)) {
LOG(4, (KERN_NOTICE "bridge-%s: couldn't linearize, packet dropped\n",
bridge->name));
return;
}
if (VNetCallSMACFunc(bridge->smac, &skb, skb->data,
SMAC_CheckPacketToHost, skb->len) !=
PacketStatusForwardPacket) {
LOG(4, (KERN_NOTICE "bridge-%s: packet dropped\n", bridge->name));
return;
}
}
/*
* Send down (imitate packet_sendmsg)
*
* Do this only if the packet is not addressed to the peer,
* and the packet size is not too big.
*/
dev_lock_list();
if (MAC_EQ(dest, dev->dev_addr) ||
skb->len > dev->mtu + dev->hard_header_len) {
dev_unlock_list();
} else {
# if 0 // XXX we should do header translation
if ((dev->flags & IFF_SOFTHEADERS) != 0) {
if (skb->len > dev->mtu) {
clone = NULL;
} else {
clone = dev_alloc_skb(skb->len + dev->hard_header_len, GFP_ATOMIC);
}
if (clone != NULL) {
skb_reserve(clone, dev->hard_header_len);
if (dev->hard_header != NULL) {
dev->hard_header(clone, dev, ETH_P_IP, NULL, NULL, skb->len);
}
memcpy(skb_put(clone, skb->len), skb->data, skb->len);
}
}
# endif
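      /*
       * Clone rather than hand off the original skb: the original is
       * still needed below for the "send up" path and is freed at the
       * end of this function.
       */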
clone = skb_clone(skb, GFP_ATOMIC);
if (clone == NULL) {
dev_unlock_list();
} else {
skb_set_owner_w(clone, bridge->sk);
clone->protocol = ((struct ethhdr *)skb->data)->h_proto; // XXX
if ((dev->flags & IFF_UP) != 0) {
dev_unlock_list();
DEV_QUEUE_XMIT(clone, dev, 0);
} else {
dev_unlock_list();
dev_kfree_skb(clone);
}
}
}
/*
* Send up (imitate Ethernet receive)
*
* Do this if the packet is addressed to the peer (or is broadcast, etc.).
*
* This packet will get back to us, via VNetBridgeReceive.
* We save it so we can recognize it (and its clones) again.
*/
if (VNetPacketMatch(dest, dev->dev_addr, NULL, 0, allMultiFilter, dev->flags)) {
clone = skb_clone(skb, GFP_ATOMIC);
if (clone) {
unsigned long flags;
int i;
clone = skb_get(clone);
clone->dev = dev;
clone->protocol = eth_type_trans(clone, dev);
spin_lock_irqsave(&bridge->historyLock, flags);
for (i = 0; i < VNET_BRIDGE_HISTORY; i++) {
if (bridge->history[i] == NULL) {
bridge->history[i] = clone;
# if LOGLEVEL >= 3
{
int j;
int count = 0;
for (j = 0; j < VNET_BRIDGE_HISTORY; j++) {
if (bridge->history[j] != NULL) {
count++;
}
}
LOG(3, (KERN_DEBUG "bridge-%s: host slot %d history %d\n",
bridge->name, i, count));
}
# endif
break;
}
}
if (i >= VNET_BRIDGE_HISTORY) {
LOG(1, (KERN_NOTICE "bridge-%s: history full\n",
bridge->name));
for (i = 0; i < VNET_BRIDGE_HISTORY; i++) {
struct sk_buff *s = bridge->history[i];
/*
* We special case 0 to avoid races with another thread on
* another cpu wanting to use the 0 entry. This could happen
* when we release the lock to free the former entry.
* See bug 11231 for details.
*/
if (i == 0) {
bridge->history[0] = clone;
} else {
bridge->history[i] = NULL;
}
if (s) {
spin_unlock_irqrestore(&bridge->historyLock, flags);
dev_kfree_skb(s);
spin_lock_irqsave(&bridge->historyLock, flags);
}
}
}
spin_unlock_irqrestore(&bridge->historyLock, flags);
/*
* We used to cli() before calling netif_rx() here. It was probably
* unneeded (as we never did it in netif.c, and the code worked). In
* any case, now that we are using netif_rx_ni(), we should certainly
* not do it, or netif_rx_ni() will deadlock on the cli() lock --hpreg
*/
netif_rx_ni(clone);
# if LOGLEVEL >= 4
do_gettimeofday(&vnetTime);
# endif
}
}
// xxx;
dev_kfree_skb(skb);
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeCycleDetect --
*
* Cycle detection algorithm.
*
* Results:
* TRUE if a cycle was detected, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
VNetBridgeCycleDetect(VNetJack *this, // IN: jack
int generation) // IN: generation
{
VNetBridge *bridge = (VNetBridge*)this->private;
return VNetCycleDetectIf(bridge->name, generation);
}
/*
*----------------------------------------------------------------------
*
* VNetBridgePortsChanged --
*
 * The number of ports connected to this jack has changed; react
* accordingly by starting/stopping promiscuous mode based on
* whether any peers exist.
*
* Results:
* None.
*
* Side effects:
* Promiscuous mode may be started or stopped.
*
*----------------------------------------------------------------------
*/
void
VNetBridgePortsChanged(VNetJack *this) // IN: jack
{
VNetBridge *bridge = (VNetBridge*)this->private;
if (bridge->dev) {
if (VNetGetAttachedPorts(this)) {
VNetBridgeStartPromisc(bridge, TRUE);
} else {
VNetBridgeStopPromisc(bridge, TRUE);
}
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeIsBridged --
*
* Reports if the bridged interface is up or down.
*
* Results:
* 1 - we are bridged but the interface is not up
* 2 - we are bridged and the interface is up
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
VNetBridgeIsBridged(VNetJack *this) // IN: jack
{
VNetBridge *bridge = (VNetBridge*)this->private;
if (bridge->dev) {
return 2;
} else {
return 1;
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeIsDeviceWireless --
*
* Check if the device is a wireless adapter, depending on the version
* of the wireless extension present in the kernel.
*
* Results:
* TRUE if the device is wireless, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static Bool
VNetBridgeIsDeviceWireless(struct net_device *dev) //IN: sock
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
# if defined(CONFIG_WIRELESS_EXT)
return dev->ieee80211_ptr != NULL || dev->wireless_handlers != NULL;
# else
return dev->ieee80211_ptr != NULL;
# endif
#elif defined(CONFIG_WIRELESS_EXT)
return dev->wireless_handlers != NULL;
#elif !defined(CONFIG_NET_RADIO)
return FALSE;
#elif defined WIRELESS_EXT && WIRELESS_EXT > 19
return dev->wireless_handlers != NULL;
#elif defined WIRELESS_EXT && WIRELESS_EXT > 12
return dev->wireless_handlers != NULL || dev->get_wireless_stats != NULL;
#else
return dev->get_wireless_stats != NULL;
#endif
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeSendLinkStateEvent --
*
* Sends a link state event.
*
* Results:
* Returns 0 if successful, or a negative value if an error occurs.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static int
VNetBridgeSendLinkStateEvent(VNetBridge *bridge, // IN: the bridge
uint32 adapter, // IN: the adapter
Bool up) // IN: the link state
{
VNet_LinkStateEvent event;
int res;
event.header.size = sizeof event;
res = VNetEvent_GetSenderId(bridge->eventSender, &event.header.senderId);
if (res != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event, "
"getSenderId failed (%d)\n", bridge->name, res));
return res;
}
event.header.eventId = 0;
event.header.classSet = VNET_EVENT_CLASS_UPLINK;
event.header.type = VNET_EVENT_TYPE_LINK_STATE;
event.adapter = adapter;
event.up = up;
res = VNetEvent_Send(bridge->eventSender, &event.header);
if (res != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event, send "
"failed (%d)\n", bridge->name, res));
}
return res;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeUp --
*
* Bring a bridge up. Gets peer's device structure, verifies
* that interface is up, checks the header length,
* allocates a socket, adds a packet handler to the network
* stack, and then places the peer's device in promiscuous
* mode.
*
* Results:
* errno.
*
* Side effects:
* Bridging may be brought up with a peer interface.
*
*----------------------------------------------------------------------
*/
static int
VNetBridgeUp(VNetBridge *bridge, // IN: bridge struct
Bool rtnlLock) // IN: acquire RTNL lock
{
int retval = 0;
if (bridge->dev != NULL) {
LOG(0, (KERN_NOTICE "bridge-%s: already up\n", bridge->name));
goto out;
}
/*
* Get peer device structure
*/
dev_lock_list();
bridge->dev = DEV_GET(bridge);
LOG(2, (KERN_DEBUG "bridge-%s: got dev %p\n",
bridge->name, bridge->dev));
if (bridge->dev == NULL) {
dev_unlock_list();
retval = -ENODEV;
goto out;
}
if (!(bridge->dev->flags & IFF_UP)) {
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is not up\n",
bridge->name, bridge->dev->name));
dev_unlock_list();
retval = -ENODEV;
goto out;
}
/*
* At a minimum, the header size should be the same as ours.
*
* XXX we should either do header translation or ensure this
* is an Ethernet.
*/
if (bridge->dev->hard_header_len != ETH_HLEN) {
LOG(1, (KERN_DEBUG "bridge-%s: can't bridge with %s, bad header length %d\n",
bridge->name, bridge->dev->name, bridge->dev->hard_header_len));
dev_unlock_list();
retval = -EINVAL;
goto out;
}
/*
* Get a socket to play with
*
* We set the dead field so we don't get a call back from dev_kfree_skb().
* (The alternative is to support the callback.)
*/
bridge->sk = compat_sk_alloc(bridge, GFP_ATOMIC);
if (bridge->sk == NULL) {
dev_unlock_list();
retval = -ENOMEM;
goto out;
}
sock_init_data(NULL, bridge->sk);
sock_set_flag(bridge->sk, SOCK_DEAD);
if (VNetBridgeIsDeviceWireless(bridge->dev)) {
LOG(1, (KERN_NOTICE "bridge-%s: device is wireless, enabling SMAC\n",
bridge->name));
bridge->wirelessAdapter = TRUE;
}
/*
* If it is a wireless adapter initialize smac struct.
*/
if (bridge->wirelessAdapter || bridge->forceSmac) {
SMAC_InitState(&(bridge->smac));
if (bridge->smac) {
/*
* Store the MAC address of the adapter
*/
SMAC_SetMac(bridge->smac, bridge->dev->dev_addr);
}
}
/*
* Link up with the peer device by adding a
* packet handler to the networking stack.
*/
bridge->pt.func = VNetBridgeReceiveFromDev;
bridge->pt.type = htons(ETH_P_ALL);
bridge->pt.dev = bridge->dev;
bridge->pt.af_packet_priv = bridge->sk;
bridge->enabledPromisc = FALSE;
bridge->warnPromisc = FALSE;
dev_add_pack(&bridge->pt);
dev_unlock_list();
/*
* Put in promiscuous mode if need be.
*/
mutex_lock(&vnetStructureMutex);
if (VNetGetAttachedPorts(&bridge->port.jack)) {
VNetBridgeStartPromisc(bridge, rtnlLock);
}
mutex_unlock(&vnetStructureMutex);
/* send link state up event */
retval = VNetBridgeSendLinkStateEvent(bridge, bridge->dev->ifindex, TRUE);
if (retval != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event (%d)\n",
bridge->name, retval));
goto out;
}
LOG(1, (KERN_DEBUG "bridge-%s: up\n", bridge->name));
/*
* Return
*/
out:
if (retval != 0) {
if (bridge->sk != NULL) {
sk_free(bridge->sk);
bridge->sk = NULL;
}
bridge->dev = NULL;
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeDown --
*
* Bring a bridge down. Stops promiscuous mode, removes the
* packet handler from the network stack, and frees the
* socket.
*
* Results:
* None.
*
* Side effects:
* Bridging is brought down.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeDown(VNetBridge *bridge, // IN: bridge
Bool rtnlLock) // IN: acquire RTNL lock
{
int retval;
if (bridge->dev == NULL) {
LOG(0, (KERN_NOTICE "bridge-%s: already down\n", bridge->name));
return;
}
/* send link state down event */
retval = VNetBridgeSendLinkStateEvent(bridge, bridge->dev->ifindex, FALSE);
if (retval != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event (%d)\n",
bridge->name, retval));
}
VNetBridgeStopPromisc(bridge, rtnlLock);
if (bridge->smac){
SMAC_SetMac(bridge->smac, NULL);
}
bridge->dev = NULL;
dev_remove_pack(&bridge->pt);
sk_free(bridge->sk);
bridge->sk = NULL;
LOG(1, (KERN_DEBUG "bridge-%s: down\n", bridge->name));
}
/*
*-----------------------------------------------------------------------------
*
* VNetBridgeNotifyLogBridgeUpError --
*
* Logs a bridge up error for the notify function following this function.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static void
VNetBridgeNotifyLogBridgeUpError(int errno, // IN: the error number
char *bridgeName, // IN: the bridge name
char *devName) // IN: the device name
{
switch (errno) {
case -ENODEV:
LOG(0, (KERN_WARNING "bridge-%s: interface %s not found or not "
"up\n", bridgeName, devName));
break;
case -EINVAL:
LOG(0, (KERN_WARNING "bridge-%s: interface %s is not a valid "
"Ethernet interface\n", bridgeName, devName));
break;
case -ENOMEM:
LOG(0, (KERN_WARNING "bridge-%s: failed to allocate memory\n",
bridgeName));
break;
default:
/* This should never happen --hpreg */
LOG(0, (KERN_WARNING "bridge-%s: failed to enable the bridge to "
"interface %s (error %d)\n", bridgeName, devName,
-errno));
break;
}
}
/*
*-----------------------------------------------------------------------------
*
* VNetBridgeNotify --
*
* Callback on peer device state change. The function brings
* the bridge up/down in response to changes in the peer device.
*
* Results:
* NOTIFY_DONE
*
* Side effects:
* Promiscuous mode is changed when bridge brought up/down.
*
*-----------------------------------------------------------------------------
*/
static int
VNetBridgeNotify(struct notifier_block *this, // IN: callback data (bridge)
u_long msg, // IN: type of event
void *data) // IN: net_device or notifier info
{
VNetBridge *bridge = list_entry(this, VNetBridge, notifier);
struct net_device *dev;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)
dev = netdev_notifier_info_to_dev(data);
#else
dev = (struct net_device *)data;
#endif
switch (msg) {
case NETDEV_UNREGISTER:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is unregistering\n",
bridge->name, dev->name));
if (dev == bridge->dev) {
/* This should never happen --hpreg */
LOG(0, (KERN_WARNING "bridge-%s: interface %s unregistered without "
"going down! Disabling the bridge\n", bridge->name,
dev->name));
VNetBridgeDown(bridge, FALSE);
}
break;
case NETDEV_DOWN:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is going down\n",
bridge->name, dev->name));
if (dev == bridge->dev) {
LOG(1, (KERN_DEBUG "bridge-%s: disabling the bridge on dev down\n",
bridge->name));
VNetBridgeDown(bridge, FALSE);
}
break;
case NETDEV_UP:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is going up\n",
bridge->name, dev->name));
if (bridge->dev == NULL && VNetBridgeDevCompatible(bridge, dev)) {
int errno;
LOG(1, (KERN_DEBUG "bridge-%s: enabling the bridge on dev up\n",
bridge->name));
errno = VNetBridgeUp(bridge, FALSE);
if (errno != 0) {
VNetBridgeNotifyLogBridgeUpError(errno, bridge->name, dev->name);
}
}
break;
default:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is sending notification "
"0x%lx\n", bridge->name, dev->name, msg));
break;
}
return NOTIFY_DONE;
}
/*
*----------------------------------------------------------------------
*
* RangeInLinearSKB --
*
* Checks if the given number of bytes from a given offset resides
 * within the linear part of the skb. If not, it attempts to
* linearize the skb.
*
* Results:
* Returns TRUE if the range of bytes is already in the linear
* portion or if linearize succeeded. Otherwise, returns FALSE if
* the linearize operation fails.
*
* Side effects:
* As in skb_linearize().
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER Bool
RangeInLinearSKB(struct sk_buff *skb, // IN:
unsigned int start, // IN: Start offset
unsigned int length) // IN: How many bytes
{
if (LIKELY(!compat_skb_is_nonlinear(skb) ||
start + length <= compat_skb_headlen(skb))) {
/*
* Nothing to do.
*/
return TRUE;
}
return compat_skb_linearize(skb) == 0;
}
/*
* Not all kernel versions have NEXTHDR_MOBILITY defined.
*/
#ifndef NEXTHDR_MOBILITY
# define NEXTHDR_MOBILITY 135 /* Mobility header. */
#endif
/*
*----------------------------------------------------------------------
*
* VNetBridgeComputeHeaderPosIPv6 --
*
* Compute correct position of transport header in IPv6 packets.
*
* Results:
* None.
*
* Side effects:
* Transport header pointer updated to point to the PDU contained
* in the packet.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeComputeHeaderPosIPv6(struct sk_buff *skb) // IN:
{
struct ipv6hdr *ipv6Hdr;
unsigned int offset; /* Offset from skb->data. */
unsigned int headerLen; /* Length of current header. */
uint8 nextHeader;
/*
* Check if the start of the network header is within the linear part of
* skb. If not, then linearize the skb.
*/
if (UNLIKELY(compat_skb_network_header(skb) < skb->data ||
compat_skb_network_header(skb) >= skb->data +
compat_skb_headlen(skb))) {
if (compat_skb_linearize(skb)) {
return; /* Bail out. */
}
}
offset = compat_skb_network_offset(skb);
if (!RangeInLinearSKB(skb, offset, sizeof *ipv6Hdr)) {
return; /* Bail out. */
}
ipv6Hdr = (struct ipv6hdr *)compat_skb_network_header(skb);
headerLen = sizeof *ipv6Hdr;
offset += headerLen; /* End of IPv6 header (not including extensions). */
/*
* All IPv6 extension headers begin with a "next header" field (one byte),
* and most of them have a "header length" field (as the 2nd byte). In each
* iteration, we find the length of the extension header and add it to
* offset from the beginning of skb. And, in each iteration we update the
* next header variable. When we return from the following for loop, offset
 * will have been incremented by the length of each extension header,
 * and the next header type will be something other than an IPv6 extension
 * header, signifying that we have walked through the entire IPv6 header. We set
* the transport header's offset to the value of this offset before exiting
* the for loop.
*/
nextHeader = ipv6Hdr->nexthdr;
for (;;) {
switch (nextHeader) {
case NEXTHDR_HOP:
case NEXTHDR_ROUTING:
case NEXTHDR_AUTH:
case NEXTHDR_DEST:
case NEXTHDR_MOBILITY:
/*
* We need to check two bytes in the option header: next header and
* header extension length.
*/
if (!RangeInLinearSKB(skb, offset, 2)) {
return; /* Bail out. */
}
headerLen = skb->data[offset + 1];
if (nextHeader == NEXTHDR_AUTH) {
headerLen = (headerLen + 2) << 2; /* See RFC 2402. */
} else {
headerLen = (headerLen + 1) << 3; /* See ipv6_optlen(). */
}
break;
case NEXTHDR_FRAGMENT:
case NEXTHDR_ESP:
case NEXTHDR_NONE:
/*
* We stop walking if we find the fragment header (NEXTHDR_FRAGMENT).
* If the payload is encrypted we may not know the start of the
* transport header [1]. So, we just return. Same applies when
* nothing follows this header (NEXTHDR_NONE).
* [1]: http://www.cu.ipv6tf.org/literatura/chap8.pdf
*/
return;
default:
/*
* We have walked through all IPv6 extension headers. Let's set the
* transport header and return.
*/
compat_skb_set_transport_header(skb, offset);
return;
}
nextHeader = skb->data[offset];
offset += headerLen;
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeComputeHeaderPos --
*
* Compute correct position for UDP/TCP header.
*
* Results:
* None.
*
* Side effects:
* transport header pointer updated to point to the tcp/udp header.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
VNetBridgeComputeHeaderPos(struct sk_buff *skb) // IN: buffer to examine
{
/* Maybe some kernel gets it right... */
if (compat_skb_network_header_len(skb)) {
return;
}
switch (be16_to_cpu(skb->protocol)) {
case ETH_P_IP: {
struct iphdr *ipHdr = compat_skb_ip_header(skb);
compat_skb_set_transport_header(skb, compat_skb_network_offset(skb) +
ipHdr->ihl * 4);
break;
}
case ETH_P_IPV6:
VNetBridgeComputeHeaderPosIPv6(skb);
break;
default:
LOG(3, (KERN_DEBUG "Unknown EII protocol %04X: csum at %d\n",
be16_to_cpu(skb->protocol), compat_skb_csum_offset(skb)));
break;
}
}
/*
* We deal with three types of kernels:
* New kernels: skb_shinfo() has gso_size member, and there is
* skb_gso_segment() helper to split GSO skb into flat ones.
* Older kernels: skb_shinfo() has tso_size member, and there is
* no helper.
* Oldest kernels: without any segmentation offload support.
*/
#if defined(NETIF_F_GSO) || LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
#define VNetBridgeIsGSO(skb) skb_shinfo(skb)->gso_size
#define VNetBridgeGSOSegment(skb) skb_gso_segment(skb, 0)
#elif defined(NETIF_F_TSO)
#define VNetBridgeIsGSO(skb) skb_shinfo(skb)->tso_size
/*
*----------------------------------------------------------------------
*
* VNetBridgeGSOSegment --
*
* Split a large TCP/IPv4 sk_buff into multiple sk_buffs of
* size skb_shinfo(skb)->tso_size
* Called from VNetBridgeSendLargePacket().
*
* Results:
* List of skbs created.
*
* Side effects:
* The incoming packet is split into multiple packets.
*
*----------------------------------------------------------------------
*/
static struct sk_buff *
VNetBridgeGSOSegment(struct sk_buff *skb) // IN: packet to split
{
struct sk_buff *segs = NULL;
struct sk_buff **next = &segs;
int bytesPerPacket, bytesLeft;
int macHdrLen, ipHdrLen, tcpHdrLen, allHdrLen;
int curByteOffset;
uint16 ipID;
uint32 seqNo;
if (((struct ethhdr *)compat_skb_mac_header(skb))->h_proto != htons(ETH_P_IP)) {
return ERR_PTR(-EPFNOSUPPORT);
}
if (compat_skb_ip_header(skb)->protocol != IPPROTO_TCP) {
return ERR_PTR(-EPROTONOSUPPORT);
}
macHdrLen = compat_skb_network_header(skb) - compat_skb_mac_header(skb);
ipHdrLen = compat_skb_ip_header(skb)->ihl << 2;
tcpHdrLen = compat_skb_tcp_header(skb)->doff << 2;
allHdrLen = macHdrLen + ipHdrLen + tcpHdrLen;
ipID = ntohs(compat_skb_ip_header(skb)->id);
seqNo = ntohl(compat_skb_tcp_header(skb)->seq);
/* Host TCP stack populated this (MSS) for the host NIC driver */
bytesPerPacket = skb_shinfo(skb)->tso_size;
bytesLeft = skb->len - allHdrLen;
curByteOffset = allHdrLen;
while (bytesLeft) {
struct sk_buff *newSkb;
int payloadSize = (bytesLeft < bytesPerPacket) ? bytesLeft : bytesPerPacket;
newSkb = dev_alloc_skb(payloadSize + allHdrLen + NET_IP_ALIGN);
if (!newSkb) {
while (segs) {
newSkb = segs;
segs = segs->next;
newSkb->next = NULL;
dev_kfree_skb(newSkb);
}
return ERR_PTR(-ENOMEM);
}
skb_reserve(newSkb, NET_IP_ALIGN);
newSkb->dev = skb->dev;
newSkb->protocol = skb->protocol;
newSkb->pkt_type = skb->pkt_type;
newSkb->ip_summed = VM_TX_CHECKSUM_PARTIAL;
/*
* MAC+IP+TCP copy
* This implies that ALL fields in the IP and TCP headers are copied from
* the original skb. This is convenient: we'll only fix up fields that
* need to be changed below
*/
memcpy(skb_put(newSkb, allHdrLen), skb->data, allHdrLen);
/* Fix up pointers to different layers */
compat_skb_reset_mac_header(newSkb);
compat_skb_set_network_header(newSkb, macHdrLen);
compat_skb_set_transport_header(newSkb, macHdrLen + ipHdrLen);
/* Payload copy */
skb_copy_bits(skb, curByteOffset, compat_skb_tail_pointer(newSkb), payloadSize);
skb_put(newSkb, payloadSize);
curByteOffset+=payloadSize;
bytesLeft -= payloadSize;
/* Fix up IP hdr */
compat_skb_ip_header(newSkb)->tot_len = htons(payloadSize + tcpHdrLen + ipHdrLen);
compat_skb_ip_header(newSkb)->id = htons(ipID);
compat_skb_ip_header(newSkb)->check = 0;
/* Recompute new IP checksum */
compat_skb_ip_header(newSkb)->check =
ip_fast_csum(compat_skb_network_header(newSkb),
compat_skb_ip_header(newSkb)->ihl);
/* Fix up TCP hdr */
compat_skb_tcp_header(newSkb)->seq = htonl(seqNo);
/* Clear FIN/PSH if not last packet */
if (bytesLeft > 0) {
compat_skb_tcp_header(newSkb)->fin = 0;
compat_skb_tcp_header(newSkb)->psh = 0;
}
/* Recompute partial TCP checksum */
compat_skb_tcp_header(newSkb)->check =
~csum_tcpudp_magic(compat_skb_ip_header(newSkb)->saddr,
compat_skb_ip_header(newSkb)->daddr,
payloadSize+tcpHdrLen, IPPROTO_TCP, 0);
/* Offset of field */
newSkb->csum = offsetof(struct tcphdr, check);
/* Join packet to the list of segments */
*next = newSkb;
next = &newSkb->next;
/* Bump up our counters */
ipID++;
seqNo += payloadSize;
}
return segs;
}
#else
#define VNetBridgeIsGSO(skb) (0)
#define VNetBridgeGSOSegment(skb) ERR_PTR(-ENOSYS)
#endif
/*
*----------------------------------------------------------------------
*
* VNetBridgeSendLargePacket --
*
 * Split and send a large TCP/IPv4 sk_buff as multiple sk_buffs that
 * fit on the wire. Called from VNetBridgeReceiveFromDev(), which is a
* protocol handler called from the bottom half, so steady as she
* goes...
*
 * The skb passed in is deallocated by this function.
*
* Results:
* None.
*
* Side effects:
* The incoming packet is split into multiple packets and sent to the
* vnet.
*
*----------------------------------------------------------------------
*/
void
VNetBridgeSendLargePacket(struct sk_buff *skb, // IN: packet to split
VNetBridge *bridge) // IN: bridge
{
struct sk_buff *segs;
segs = VNetBridgeGSOSegment(skb);
dev_kfree_skb(skb);
if (IS_ERR(segs)) {
LOG(1, (KERN_DEBUG "bridge-%s: cannot segment packet: error %ld\n",
bridge->name, PTR_ERR(segs)));
return;
}
while (segs) {
struct sk_buff *newSkb;
newSkb = segs;
segs = newSkb->next;
newSkb->next = NULL;
/* Send it along */
skb = newSkb;
VNetSend(&bridge->port.jack, newSkb);
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeReceiveFromDev --
*
* Receive a packet from a bridged peer device
*
* This is called from the bottom half. Must be careful.
*
* Results:
* errno.
*
* Side effects:
* A packet may be sent to the vnet.
*
*----------------------------------------------------------------------
*/
int
VNetBridgeReceiveFromDev(struct sk_buff *skb, // IN: packet to receive
struct net_device *dev, // IN: unused
struct packet_type *pt, // IN: pt (pointer to bridge)
struct net_device *real_dev) // IN: real device, unused
{
VNetBridge *bridge = list_entry(pt, VNetBridge, pt);
int i;
unsigned long flags;
if (bridge->dev == NULL) {
LOG(3, (KERN_DEBUG "bridge-%s: received %d closed\n",
bridge->name, (int) skb->len));
dev_kfree_skb(skb);
return -EIO; // value is ignored anyway
}
/*
 * Check if this is a packet that we sent up to the host, and if
* so then don't bother to receive the packet.
*/
spin_lock_irqsave(&bridge->historyLock, flags);
for (i = 0; i < VNET_BRIDGE_HISTORY; i++) {
struct sk_buff *s = bridge->history[i];
if (s != NULL &&
(s == skb || SKB_IS_CLONE_OF(skb, s))) {
bridge->history[i] = NULL;
spin_unlock_irqrestore(&bridge->historyLock, flags);
dev_kfree_skb(s);
LOG(3, (KERN_DEBUG "bridge-%s: receive %d self %d\n",
bridge->name, (int) skb->len, i));
dev_kfree_skb(skb);
return 0;
}
}
spin_unlock_irqrestore(&bridge->historyLock, flags);
# if LOGLEVEL >= 4
{
struct timeval now;
do_gettimeofday(&now);
LOG(3, (KERN_DEBUG "bridge-%s: time %d\n",
bridge->name,
(int)((now.tv_sec * 1000000 + now.tv_usec)
- (vnetTime.tv_sec * 1000000 + vnetTime.tv_usec))));
}
# endif
/*
* SMAC might linearize the skb, but linearizing a shared skb is a no-no,
* so check for sharing before calling out to SMAC.
*/
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
return 0;
}
if (bridge->smac) {
/*
      * The wireless driver processes the packet and strips the Ethernet
      * header, reducing the length by that amount. We need the raw
      * Ethernet packet length, hence we add the Ethernet header length
      * back for incoming packets.
*
* Note that SMAC interfaces assume skb linearity.
*/
if (compat_skb_is_nonlinear(skb) && compat_skb_linearize(skb)) {
LOG(4, (KERN_NOTICE "bridge-%s: couldn't linearize, packet dropped\n",
bridge->name));
return 0;
}
if (VNetCallSMACFunc(bridge->smac, &skb, compat_skb_mac_header(skb),
SMAC_CheckPacketFromHost, skb->len + ETH_HLEN) !=
PacketStatusForwardPacket) {
LOG(4, (KERN_NOTICE "bridge-%s: packet dropped\n", bridge->name));
return 0;
}
}
/*
* Unbelievable... Caller sets h.raw = nh.raw before invoking us...
*/
VNetBridgeComputeHeaderPos(skb);
skb_push(skb, skb->data - compat_skb_mac_header(skb));
LOG(3, (KERN_DEBUG "bridge-%s: receive %d\n",
bridge->name, (int) skb->len));
/*
* If this is a large packet, chop chop chop (if supported)...
*/
if (VNetBridgeIsGSO(skb)) {
VNetBridgeSendLargePacket(skb, bridge);
} else {
VNetSend(&bridge->port.jack, skb);
}
return 0;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeProcRead --
*
* Callback for read operation on this bridge entry in vnets proc fs.
*
* Results:
* Length of read operation.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
VNetBridgeProcRead(char *page, // IN/OUT: buffer to write into
char **start, // OUT: 0 if file < 4k, else offset into page
off_t off, // IN: (unused) offset of read into the file
int count, // IN: (unused) maximum number of bytes to read
int *eof, // OUT: TRUE if there is nothing more to read
void *data) // IN: client data - pointer to bridge
{
VNetBridge *bridge = (VNetBridge*)data;
int len = 0;
if (!bridge) {
return len;
}
len += VNetPrintPort(&bridge->port, page+len);
len += sprintf(page+len, "dev %s ", bridge->name);
len += sprintf(page+len, "\n");
*start = 0;
*eof = 1;
return len;
}
Then resume the procedure described previously:
# cd /usr/lib/vmware/modules/source/vmmon-only ; make
etc ...
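For what it's worth, here is roughly what the rest looked like on my machine (my assumption: same module layout as in the previous post; adjust the paths to your kernel):
# cd /usr/lib/vmware/modules/source/vmnet-only ; make
# cp /usr/lib/vmware/modules/source/vmmon-only/vmmon.ko /lib/modules/$(uname -r)/misc/
# cp /usr/lib/vmware/modules/source/vmnet-only/vmnet.ko /lib/modules/$(uname -r)/misc/
# depmod -a
# modprobe vmmon
# modprobe vmnet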
Hoping this saves some of you the time I lost getting this *&@@@$ proprietary thing installed!!!
Have a good evening