Hello,
I know that among ourselves we are all free-software devotees ... but we have to interact with playmates who, for their part, seem to have no problem using proprietary %$# !
This little post follows up on the previous one about installing VMplayer after the upgrade to FC26!
I struggled quite a bit before managing to install this proprietary thing, the main trouble coming from compilation problems, so as usual: if the hours I spent finding the solution can save some for others, I am sharing ...
The procedure remains very similar to the previous one, except that a few files have to be modified before compiling.
So, after running:
# tar -xvf /usr/lib/vmware/modules/source/vmmon.tar --directory /usr/lib/vmware/modules/source
# tar -xvf /usr/lib/vmware/modules/source/vmnet.tar --directory /usr/lib/vmware/modules/source
Before compiling, there are a few modifications to make in certain files:
- add the file /usr/lib/vmware/modules/source/vmmon-only/include/compat_timer.h, with the following content:
#ifndef __COMPAT_TIMER_H__
# define __COMPAT_TIMER_H__
#include <linux/timer.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) && !defined(timer_setup)
typedef unsigned long compat_timer_arg_t;
static inline void timer_setup(struct timer_list *timer,
void (*func)(compat_timer_arg_t),
unsigned int flags)
{
init_timer(timer);
timer->function = func;
timer->data = 0;
timer->flags = flags;
}
#else /* new timer interface since 4.15 */
typedef struct timer_list *compat_timer_arg_t;
#endif /* new timer interface since 4.15 */
#endif /* __COMPAT_TIMER_H__ */
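As a side note, here is a minimal sketch (not part of the files to modify; MyTimeout, myTimer and MyArmTimer are made-up names) showing how a callback written against this shim builds on both sides of the 4.15 timer API change:

#include <linux/jiffies.h>
#include "compat_timer.h"

static struct timer_list myTimer;

/* Receives an 'unsigned long' before 4.15 and a 'struct timer_list *' from 4.15 on. */
static void MyTimeout(compat_timer_arg_t unused)
{
   /* handle the expiry here */
}

static void MyArmTimer(void)
{
   timer_setup(&myTimer, MyTimeout, 0);  /* shim on < 4.15, native API on >= 4.15 */
   myTimer.expires = jiffies + HZ;       /* fire in about one second */
   add_timer(&myTimer);
}

This is exactly the pattern driver.c below uses for tscTimer and LinuxDriverEstimateTSCkHzDeferred.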
Modify the file /usr/lib/vmware/modules/source/vmmon-only/linux/driver.c so that it reads as follows:
/*********************************************************
* Copyright (C) 1998-2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/* Must come before any kernel header file */
#include "driver-config.h"
#define EXPORT_SYMTAB
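/* compat_timer.h is the shim file created above; on pre-4.15 kernels it supplies timer_setup() and compat_timer_arg_t. */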
#include "compat_timer.h"
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/poll.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/wait.h>
#include <asm/hw_irq.h> /* for CALL_FUNCTION_VECTOR */
#include "compat_version.h"
#include "compat_module.h"
#include "compat_page.h"
#include "usercalldefs.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 16)
#error Linux before 2.6.16 is not supported
#endif
#include <asm/io.h>
#include "vmware.h"
#include "driverLog.h"
#include "driver.h"
#include "modulecall.h"
#include "vm_asm.h"
#include "vmx86.h"
#include "initblock.h"
#include "task.h"
#include "memtrack.h"
#include "task.h"
#include "cpuid.h"
#include "cpuid_info.h"
#include "circList.h"
#include "x86msr.h"
#ifdef VMX86_DEVEL
#include "private.h"
#endif
#include "hostif.h"
#include "hostif_priv.h"
#include "vmhost.h"
#include "vmmonInt.h"
static void LinuxDriverQueue(VMLinux *vmLinux);
static void LinuxDriverDequeue(VMLinux *vmLinux);
static Bool LinuxDriverCheckPadding(void);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
#define VMW_NOPAGE_2624
#endif
#define VMMON_UNKNOWN_SWAP_SIZE -1ULL
struct VMXLinuxState linuxState;
/*
*----------------------------------------------------------------------
*
* Device Driver Interface --
*
* Runs the VM by implementing open/close/ioctl functions
*
*
*----------------------------------------------------------------------
*/
static int LinuxDriver_Open(struct inode *inode, struct file *filp);
/*
* gcc-4.5+ can name-mangle LinuxDriver_Ioctl, but our stack-size
* script needs to find it. So it shouldn't be static. ("hidden"
* visibility would be OK.)
*/
long LinuxDriver_Ioctl(struct file *filp, u_int iocmd,
unsigned long ioarg);
static int LinuxDriver_Close(struct inode *inode, struct file *filp);
static unsigned int LinuxDriverPoll(struct file *file, poll_table *wait);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
static int LinuxDriverFault(struct vm_fault *fault);
#elif defined(VMW_NOPAGE_2624)
static int LinuxDriverFault(struct vm_area_struct *vma, struct vm_fault *fault);
#else
static struct page *LinuxDriverNoPage(struct vm_area_struct *vma,
unsigned long address,
int *type);
#endif
static int LinuxDriverMmap(struct file *filp, struct vm_area_struct *vma);
static void LinuxDriverPollTimeout(compat_timer_arg_t clientData);
static unsigned int LinuxDriverEstimateTSCkHz(void);
static struct vm_operations_struct vmuser_mops = {
#ifdef VMW_NOPAGE_2624
.fault = LinuxDriverFault
#else
.nopage = LinuxDriverNoPage
#endif
};
static struct file_operations vmuser_fops;
static struct timer_list tscTimer;
static Atomic_uint32 tsckHz;
static VmTimeStart tsckHzStartTime;
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHzWork --
*
* Estimates TSC frequency in terms of cycles and system uptime
* elapsed since module init. At module init, the starting cycle
* count and uptime are recorded (in tsckHzStartTime) and a timer
* is scheduled to call this function after 4 seconds.
*
* It is possible that vmx queries the TSC rate after module init
* but before the 4s timer expires. In that case, we just go ahead
* and compute the rate for the duration since the driver loaded.
* When the timer expires, the new computed value is dropped. If the
* query races with the timer, the first thread to write to 'tsckHz'
* wins.
*
*----------------------------------------------------------------------
*/
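/*
 * Worked example with made-up numbers: 9,600,000,000 elapsed cycles over
 * 4,000,000 microseconds of uptime (uptime ticks at UPTIME_FREQ = 1 MHz,
 * see hostif.c) is 2400 cycles/us, i.e. a 2.4 GHz TSC, which
 * Vmx86_ComputekHz would report as 2400000 kHz.
 */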
static void
LinuxDriverEstimateTSCkHzWork(void *data)
{
VmTimeStart curTime;
uint64 cycles;
uint64 uptime;
unsigned int khz;
ASSERT(tsckHzStartTime.count != 0 && tsckHzStartTime.time != 0);
Vmx86_ReadTSCAndUptime(&curTime);
cycles = curTime.count - tsckHzStartTime.count;
uptime = curTime.time - tsckHzStartTime.time;
khz = Vmx86_ComputekHz(cycles, uptime);
if (khz != 0) {
if (Atomic_ReadIfEqualWrite(&tsckHz, 0, khz) == 0) {
Log("TSC frequency estimated using system uptime: %u\n", khz);
}
} else if (Atomic_ReadIfEqualWrite(&tsckHz, 0, cpu_khz) == 0) {
Log("Failed to compute TSC frequency, using cpu_khz: %u\n", cpu_khz);
}
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHz --
*
* Returns the estimated TSC khz, cached in tsckHz. If tsckHz is
* 0, the routine kicks off estimation work on CPU 0.
*
* Results:
*
* Returns the estimated TSC khz value.
*
*----------------------------------------------------------------------
*/
static unsigned int
LinuxDriverEstimateTSCkHz(void)
{
int err;
uint32 khz;
khz = Atomic_Read(&tsckHz);
if (khz != 0) {
return khz;
}
err = compat_smp_call_function_single(0, LinuxDriverEstimateTSCkHzWork,
NULL, 1);
/*
* The smp function call may fail for two reasons, either
* the function is not supported by the kernel, or the cpu
* went offline. In this unlikely event, we just perform
* the work wherever we can.
*/
if (err != 0) {
LinuxDriverEstimateTSCkHzWork(NULL);
}
return Atomic_Read(&tsckHz);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHzDeferred --
*
* Timer callback for deferred TSC rate estimation.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverEstimateTSCkHzDeferred(compat_timer_arg_t unused)
{
LinuxDriverEstimateTSCkHz();
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverInitTSCkHz --
*
* Initialize TSC khz rate.
*
* We rely on the kernel estimated cycle rate in the exported
* variable tsc_khz. If the kernel has disabled tsc, tsc_khz
* will be 0, and we fall back on our own estimation routines.
*
* Side effects:
*
* If tsc_khz is unusable, schedules a 4s timer for deferred
* khz estimation (see LinuxDriverEstimateTSCkHz).
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverInitTSCkHz(void)
{
unsigned int khz;
khz = compat_tsc_khz();
if (khz != 0) {
Atomic_Write(&tsckHz, khz);
Log("Using tsc_khz as TSC frequency: %u\n", khz);
return;
}
Vmx86_ReadTSCAndUptime(&tsckHzStartTime);
tscTimer.expires = jiffies + 4 * HZ;
add_timer(&tscTimer);
}
/*
*----------------------------------------------------------------------
*
* init_module --
*
* linux module entry point. Called by /sbin/insmod command
*
* Results:
* registers a device driver for a major # that depends
* on the uid. Add yourself to that list. List is now in
* private/driver-private.c.
*
*----------------------------------------------------------------------
*/
int
init_module(void)
{
int retval;
DriverLog_Init("/dev/vmmon");
HostIF_InitGlobalLock();
if (!LinuxDriverCheckPadding()) {
return -ENOEXEC;
}
CPUID_Init();
if (!Task_Initialize()) {
return -ENOEXEC;
}
/*
* Initialize LinuxDriverPoll state
*/
init_waitqueue_head(&linuxState.pollQueue);
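/* Patched: timer_setup() replaces the old init_timer()/.function/.data setup. */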
timer_setup(&linuxState.pollTimer, LinuxDriverPollTimeout, 0);
linuxState.fastClockThread = NULL;
linuxState.fastClockFile = NULL;
linuxState.fastClockRate = 0;
linuxState.fastClockPriority = -20;
linuxState.swapSize = VMMON_UNKNOWN_SWAP_SIZE;
/*
* Initialize the file_operations structure. Because this code is always
* compiled as a module, it is fine to do this here and not in a static
* initializer.
*/
memset(&vmuser_fops, 0, sizeof vmuser_fops);
vmuser_fops.owner = THIS_MODULE;
vmuser_fops.poll = LinuxDriverPoll;
vmuser_fops.unlocked_ioctl = LinuxDriver_Ioctl;
vmuser_fops.compat_ioctl = LinuxDriver_Ioctl;
vmuser_fops.open = LinuxDriver_Open;
vmuser_fops.release = LinuxDriver_Close;
vmuser_fops.mmap = LinuxDriverMmap;
#ifdef VMX86_DEVEL
devel_init_module();
linuxState.minor = 0;
retval = register_chrdev(linuxState.major, linuxState.deviceName,
&vmuser_fops);
#else
sprintf(linuxState.deviceName, "vmmon");
linuxState.major = 10;
linuxState.minor = 165;
linuxState.misc.minor = linuxState.minor;
linuxState.misc.name = linuxState.deviceName;
linuxState.misc.fops = &vmuser_fops;
retval = misc_register(&linuxState.misc);
#endif
if (retval) {
Warning("Module %s: error registering with major=%d minor=%d\n",
linuxState.deviceName, linuxState.major, linuxState.minor);
return -ENOENT;
}
Log("Module %s: registered with major=%d minor=%d\n",
linuxState.deviceName, linuxState.major, linuxState.minor);
HostIF_InitUptime();
timer_setup(&tscTimer, LinuxDriverEstimateTSCkHzDeferred, 0);
LinuxDriverInitTSCkHz();
Vmx86_InitIDList();
Log("Module %s: initialized\n", linuxState.deviceName);
return 0;
}
/*
*----------------------------------------------------------------------
*
* cleanup_module --
*
* Called by /sbin/rmmod
*
*
*----------------------------------------------------------------------
*/
void
cleanup_module(void)
{
/*
* XXX smp race?
*/
#ifdef VMX86_DEVEL
unregister_chrdev(linuxState.major, linuxState.deviceName);
#else
misc_deregister(&linuxState.misc);
#endif
Log("Module %s: unloaded\n", linuxState.deviceName);
del_timer_sync(&linuxState.pollTimer);
del_timer_sync(&tscTimer);
Task_Terminate();
// Make sure fastClockThread is dead
HostIF_FastClockLock(1);
HostIF_SetFastClockRate(0);
HostIF_FastClockUnlock(1);
HostIF_CleanupUptime();
}
/*
*----------------------------------------------------------------------
*
* LinuxDriver_Open --
*
* called on open of /dev/vmmon or /dev/vmx86.$USER. Use count used
* to determine eventual deallocation of the module
*
* Side effects:
* Increment use count used to determine eventual deallocation of
* the module
*
*----------------------------------------------------------------------
*/
static int
LinuxDriver_Open(struct inode *inode, // IN
struct file *filp) // IN
{
VMLinux *vmLinux;
vmLinux = kmalloc(sizeof *vmLinux, GFP_KERNEL);
if (vmLinux == NULL) {
return -ENOMEM;
}
memset(vmLinux, 0, sizeof *vmLinux);
sema_init(&vmLinux->lock4Gb, 1);
init_waitqueue_head(&vmLinux->pollQueue);
filp->private_data = vmLinux;
LinuxDriverQueue(vmLinux);
Vmx86_Open();
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverAllocPages --
*
* Allocate physically contiguous block of memory with specified order.
* Pages in the allocated block are configured so that caller can pass
* independent pages to the VM.
*
* Results:
* Zero on success, non-zero (error code) on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static int
LinuxDriverAllocPages(unsigned int gfpFlag, // IN
unsigned int order, // IN
struct page **pg, // OUT
unsigned int size) // IN
{
struct page* page;
page = alloc_pages(gfpFlag, order);
if (page) {
unsigned int i;
/*
* Grab an extra reference on all pages except first one - first
* one was already refcounted by alloc_pages.
*
* Under normal situation all pages except first one in the block
* have refcount zero. As we pass these pages to the VM, we must
* bump their count, otherwise the VM will release these pages every
* time they are unmapped from the user's process, causing a crash.
*
* Note that this depends on Linux VM internals. It works on all
* kernels we care about.
*/
order = 1 << order;
for (i = 0; i < order; i++) {
if (i) {
/*
* Debug kernels assert that page->_count is not zero when
* calling get_page. We use init_page_count as a temporary
* workaround. PR 894174
*/
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16)
ASSERT(page_count(page) == 0);
init_page_count(page);
#else
get_page(page);
#endif
}
if (i >= size) {
put_page(page);
} else {
void *addr = kmap(page);
memset(addr, 0, PAGE_SIZE);
kunmap(page);
*pg++ = page;
}
page++;
}
return 0;
}
return -ENOMEM;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverDestructor4Gb --
*
* Deallocate all directly mappable memory.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static void
LinuxDriverDestructor4Gb(VMLinux *vmLinux) // IN
{
unsigned int pg;
if (!vmLinux->size4Gb) {
return;
}
for (pg = 0; pg < vmLinux->size4Gb; pg++) {
put_page(vmLinux->pages4Gb[pg]);
}
vmLinux->size4Gb = 0;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriver_Close --
*
* called on close of /dev/vmmon or /dev/vmx86.$USER, most often when the
* process exits. Decrement use count, allowing for possible uninstalling
* of the module.
*
*----------------------------------------------------------------------
*/
static int
LinuxDriver_Close(struct inode *inode, // IN
struct file *filp) // IN
{
VMLinux *vmLinux;
vmLinux = (VMLinux *)filp->private_data;
ASSERT(vmLinux);
LinuxDriverDequeue(vmLinux);
if (vmLinux->vm != NULL) {
Vmx86_ReleaseVM(vmLinux->vm);
vmLinux->vm = NULL;
}
Vmx86_Close();
/*
* Destroy all low memory allocations.
* We are closing the struct file here, so clearly no other process
* uses it anymore, and we do not need to hold the semaphore.
*/
LinuxDriverDestructor4Gb(vmLinux);
/*
* Clean up poll state.
*/
HostIF_PollListLock(0);
if (vmLinux->pollBack != NULL) {
if ((*vmLinux->pollBack = vmLinux->pollForw) != NULL) {
vmLinux->pollForw->pollBack = vmLinux->pollBack;
}
}
HostIF_PollListUnlock(0);
// XXX call wake_up()?
HostIF_UnmapUserMem(vmLinux->pollTimeoutHandle);
kfree(vmLinux);
filp->private_data = NULL;
return 0;
}
#define POLLQUEUE_MAX_TASK 1000
static DEFINE_SPINLOCK(pollQueueLock);
static void *pollQueue[POLLQUEUE_MAX_TASK];
static unsigned int pollQueueCount = 0;
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverQueuePoll --
*
* Remember that current process waits for next timer event.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
LinuxDriverQueuePoll(void)
{
unsigned long flags;
spin_lock_irqsave(&pollQueueLock, flags);
/*
* Under normal circumstances every process should be listed
* only once in this array. If it becomes a problem that a process
* can be in the array twice, walk the array! Maybe you can keep
* it sorted by 'current' value then, making IsPollQueued
* a bit faster...
*/
if (pollQueueCount < POLLQUEUE_MAX_TASK) {
pollQueue[pollQueueCount++] = current;
}
spin_unlock_irqrestore(&pollQueueLock, flags);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverIsPollQueued --
*
* Determine whether timer event occurred since we queued for it using
* LinuxDriverQueuePoll.
*
* Results:
* 0 Event already occurred.
* 1 Event did not occur yet.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER int
LinuxDriverIsPollQueued(void)
{
unsigned long flags;
unsigned int i;
int retval = 0;
spin_lock_irqsave(&pollQueueLock, flags);
for (i = 0; i < pollQueueCount; i++) {
if (current == pollQueue[i]) {
retval = 1;
break;
}
}
spin_unlock_irqrestore(&pollQueueLock, flags);
return retval;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverFlushPollQueue --
*
* Signal to queue that timer event occurred.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
LinuxDriverFlushPollQueue(void)
{
unsigned long flags;
spin_lock_irqsave(&pollQueueLock, flags);
pollQueueCount = 0;
spin_unlock_irqrestore(&pollQueueLock, flags);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverWakeUp --
*
* Wake up processes waiting on timer event.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void
LinuxDriverWakeUp(Bool selective) // IN:
{
if (selective && linuxState.pollList != NULL) {
struct timeval tv;
VmTimeType now;
VMLinux *p;
VMLinux *next;
HostIF_PollListLock(1);
do_gettimeofday(&tv);
now = tv.tv_sec * 1000000ULL + tv.tv_usec;
for (p = linuxState.pollList; p != NULL; p = next) {
next = p->pollForw;
if (p->pollTime <= now) {
if ((*p->pollBack = next) != NULL) {
next->pollBack = p->pollBack;
}
p->pollForw = NULL;
p->pollBack = NULL;
wake_up(&p->pollQueue);
}
}
HostIF_PollListUnlock(1);
}
LinuxDriverFlushPollQueue();
wake_up(&linuxState.pollQueue);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverPoll --
*
* This is used to wake up the VMX when a user call arrives, or
* to wake up select() or poll() at the next clock tick.
*
*----------------------------------------------------------------------
*/
static unsigned int
LinuxDriverPoll(struct file *filp, // IN:
poll_table *wait) // IN:
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
unsigned int mask = 0;
/*
* Set up or check the timeout for fast wakeup.
*
* Thanks to Petr for this simple and correct implementation:
*
* There are four cases of wait == NULL:
* another file descriptor is ready in the same poll()
* just slept and woke up
* nonblocking poll()
* did not sleep due to memory allocation on 2.4.21-9.EL
* In first three cases, it's okay to return POLLIN.
* Unfortunately, for 4th variant we have to do some
* bookkeeping to not return POLLIN when timer did not expire
* yet.
*
* We may schedule a timer unnecessarily if an existing
* timer fires between poll_wait() and timer_pending().
*
* -- edward
*/
if (wait == NULL) {
if (vmLinux->pollBack == NULL && !LinuxDriverIsPollQueued()) {
mask = POLLIN;
}
} else {
if (linuxState.fastClockThread && vmLinux->pollTimeoutPtr != NULL) {
struct timeval tv;
do_gettimeofday(&tv);
poll_wait(filp, &vmLinux->pollQueue, wait);
vmLinux->pollTime = *vmLinux->pollTimeoutPtr +
tv.tv_sec * 1000000ULL + tv.tv_usec;
if (vmLinux->pollBack == NULL) {
HostIF_PollListLock(2);
if (vmLinux->pollBack == NULL) {
if ((vmLinux->pollForw = linuxState.pollList) != NULL) {
vmLinux->pollForw->pollBack = &vmLinux->pollForw;
}
linuxState.pollList = vmLinux;
vmLinux->pollBack = &linuxState.pollList;
}
HostIF_PollListUnlock(2);
}
} else {
LinuxDriverQueuePoll();
poll_wait(filp, &linuxState.pollQueue, wait);
if (!timer_pending(&linuxState.pollTimer)) {
mod_timer(&linuxState.pollTimer, jiffies + 1);
}
}
}
return mask;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverPollTimeout --
*
* Wake up a process waiting in poll/select. This is called from
* the timer, and hence processed in the bottom half
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverPollTimeout(compat_timer_arg_t clientData) // IN:
{
LinuxDriverWakeUp(FALSE);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverNoPage/LinuxDriverFault --
*
* Callback for returning allocated page for memory mapping
*
* Results:
* NoPage:
* Page or page address on success, NULL or 0 on failure.
* Fault:
* Error code; 0, minor page fault.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
static int
LinuxDriverFault(struct vm_fault *fault) //IN/OUT
#elif defined(VMW_NOPAGE_2624)
static int LinuxDriverFault(struct vm_area_struct *vma, //IN
struct vm_fault *fault) //IN/OUT
#else
static struct page *LinuxDriverNoPage(struct vm_area_struct *vma, //IN
unsigned long address, //IN
int *type) //OUT: Fault type
#endif
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
struct vm_area_struct *vma = fault->vma;
#endif
VMLinux *vmLinux = (VMLinux *) vma->vm_file->private_data;
unsigned long pg;
struct page* page;
#ifdef VMW_NOPAGE_2624
pg = fault->pgoff;
#else
pg = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
#endif
pg = VMMON_MAP_OFFSET(pg);
if (pg >= vmLinux->size4Gb) {
#ifdef VMW_NOPAGE_2624
return VM_FAULT_SIGBUS;
#else
return 0;
#endif
}
page = vmLinux->pages4Gb[pg];
get_page(page);
#ifdef VMW_NOPAGE_2624
fault->page = page;
return 0;
#else
*type = VM_FAULT_MINOR;
return page;
#endif
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverAllocContig --
*
* Create mapping for contiguous memory areas.
*
* Results:
*
* 0 on success,
* -EINVAL on invalid arguments or
* -ENOMEM on out of memory
*
* Side effects:
* Pages for mapping are allocated.
*
*-----------------------------------------------------------------------------
*/
static int LinuxDriverAllocContig(VMLinux *vmLinux,
struct vm_area_struct *vma,
unsigned long off,
unsigned long size)
{
unsigned long vmaOrder = VMMON_MAP_ORDER(off);
unsigned long vmaAllocSize;
unsigned int gfpFlag;
unsigned long i;
if (VMMON_MAP_RSVD(off)) {
/* Reserved bits set... */
return -EINVAL;
}
if (VMMON_MAP_OFFSET(off)) {
/* We do not need non-zero offsets... */
return -EINVAL;
}
switch (VMMON_MAP_MT(off)) {
case VMMON_MAP_MT_LOW4GB:
#ifdef GFP_DMA32
gfpFlag = GFP_USER | GFP_DMA32;
#else
gfpFlag = GFP_USER | GFP_DMA;
#endif
break;
case VMMON_MAP_MT_LOW16MB:
gfpFlag = GFP_USER | GFP_DMA;
break;
case VMMON_MAP_MT_ANY:
gfpFlag = GFP_HIGHUSER;
break;
default:
/* Invalid memory type */
return -EINVAL;
}
if (size > VMMON_MAP_OFFSET_MASK + 1) {
/* Size is too big to fit to our window. */
return -ENOMEM;
}
/* 16 pages looks like a good limit... */
if (size > VMMON_MAX_LOWMEM_PAGES) {
return -ENOMEM;
}
/* Sorry. Only one mmap per one open. */
down(&vmLinux->lock4Gb);
if (vmLinux->size4Gb) {
up(&vmLinux->lock4Gb);
return -EINVAL;
}
vmaAllocSize = 1 << vmaOrder;
for (i = 0; i < size; i += vmaAllocSize) {
int err;
err = LinuxDriverAllocPages(gfpFlag, vmaOrder,
vmLinux->pages4Gb + i, size - i);
if (err) {
while (i > 0) {
put_page(vmLinux->pages4Gb[--i]);
}
up(&vmLinux->lock4Gb);
return err;
}
}
vmLinux->size4Gb = size;
up(&vmLinux->lock4Gb);
vma->vm_ops = &vmuser_mops;
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverMmap --
*
* Create mapping for lowmem or locked memory.
*
* Results:
*
* 0 on success,
* -EINVAL on invalid arguments or
* -ENOMEM on out of memory
*
* Side effects:
* Pages for mapping are allocated.
*
*-----------------------------------------------------------------------------
*/
static int
LinuxDriverMmap(struct file *filp,
struct vm_area_struct *vma)
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
unsigned long size;
int err;
/* Only shared mappings */
if (!(vma->vm_flags & VM_SHARED)) {
return -EINVAL;
}
if ((vma->vm_end | vma->vm_start) & (PAGE_SIZE - 1)) {
return -EINVAL;
}
size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (size < 1) {
return -EINVAL;
}
if (vmLinux->vm) {
err = -EINVAL;
} else {
err = LinuxDriverAllocContig(vmLinux, vma, vma->vm_pgoff, size);
}
if (err) {
return err;
}
/* Clear VM_IO, otherwise SuSE's kernels refuse to do get_user_pages */
vma->vm_flags &= ~VM_IO;
return 0;
}
typedef Bool (*SyncFunc)(void *data, unsigned cpu);
typedef struct {
Atomic_uint32 numCPUs;
Atomic_uint32 ready;
Atomic_uint32 failures;
Atomic_uint32 done;
SyncFunc func;
void *data;
} SyncFuncArgs;
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncCallHook --
*
* Called on each CPU, waits for them all to show up, and executes
* the callback.
*
* Results:
*
* Side effects:
* Whatever side effects the callback has.
*
*-----------------------------------------------------------------------------
*/
static void
LinuxDriverSyncCallHook(void *data) // IN:
{
Bool success;
uint32 numCPUs;
volatile unsigned iterations = 1000 * 1000;
SyncFuncArgs *args = (SyncFuncArgs *)data;
unsigned cpu = smp_processor_id();
/*
* We need to be careful about reading cpu_online_map on kernels that
* have hot add/remove cpu support. The kernel's smp_call_function
* blocks hot add from occurring between the time it computes the set
* of cpus it will IPI and when all those cpus have entered their IPI
* handlers. Additionally, we disabled preemption on the initiating
* cpu during the entire sync call sequence. So, since a cpu hot add
* is initiated from process context, a cpu cannot be hot added until
* at least one cpu has exited this code, and therefore it is safe
* for the first cpu to reach this point to read cpu_online_map.
*
* Hot remove works by stopping the entire machine, which is done by
* waiting for a set of kernel threads to be scheduled on all cpus.
* This cannot happen until all cpus are preemptible. Since the
* initiating cpu has preemption disabled during this entire
* sequence, this code is also safe from cpu hot remove.
*
* So, the first cpu to reach this code will read the same value of
* cpu_online_map that was used by smp_call_function, and therefore
* we can safely assume that numCPUs cpus will execute this routine.
*/
Atomic_CMPXCHG32(&args->numCPUs, 0, num_online_cpus());
numCPUs = Atomic_Read(&args->numCPUs);
Atomic_Inc(&args->ready);
/*
* Wait for all CPUs, but not forever since we could deadlock. The
* potential deadlock scenario is this: cpu0 has IF=1 and holds a
* lock. cpu1 has IF=0 and is spinning waiting for the lock.
*/
while (Atomic_Read(&args->ready) != numCPUs && --iterations) ;
/* Now simultaneously call the routine. */
success = args->func(args->data, cpu);
if (!iterations || !success) {
/* Indicate that we either timed out or the callback failed. */
Atomic_Inc(&args->failures);
}
/* Indicate that we are finished. */
Atomic_Inc(&args->done);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncCallOnEachCPU --
*
* Calls func on each cpu at (nearly) the same time.
*
* Results:
* TRUE if func was called at the same time on all cpus. Note that
* func is called regardless of whether all cpus showed up in time.
*
* Side effects:
* func's side effects, on all cpus.
*
*-----------------------------------------------------------------------------
*/
static Bool
LinuxDriverSyncCallOnEachCPU(SyncFunc func, // IN:
void *data) // IN:
{
SyncFuncArgs args;
uintptr_t flags;
ASSERT(HostIF_GlobalLockIsHeld());
args.func = func;
args.data = data;
Atomic_Write(&args.numCPUs, 0); // Must be calculated inside the callback.
Atomic_Write(&args.ready, 0);
Atomic_Write(&args.failures, 0);
Atomic_Write(&args.done, 0);
preempt_disable();
/*
* Call all other CPUs, but do not wait so we can enter the callback
* on this CPU too.
*/
compat_smp_call_function(LinuxDriverSyncCallHook, &args, 0);
/*
* smp_call_function doesn't return until all cpus have been
* interrupted. It's safe to disable interrupts now that all other
* cpus are in their IPI handlers.
*/
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
LinuxDriverSyncCallHook(&args);
RESTORE_FLAGS(flags);
preempt_enable();
/*
* Wait for everyone else to finish so we can get an accurate
* failures count.
*/
while (Atomic_Read(&args.done) != Atomic_Read(&args.numCPUs)) ;
/*
* This routine failed if any CPU bailed out early to avoid deadlock,
* or the callback routine failed on any CPU. Both conditions are
* recorded in the failures field.
*/
return Atomic_Read(&args.failures) == 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverReadTSC --
*
* Callback that is executed simultaneously on all cpus to read the TSCs.
*
* Results:
* TRUE.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static Bool
LinuxDriverReadTSC(void *data, // OUT: TSC values
unsigned cpu) // IN: the pcpu number
{
TSCDelta *tscDelta = (TSCDelta *)data;
uint64 tsc, old;
if (LIKELY(CPUID_SSE2Supported())) {
RDTSC_BARRIER();
}
tsc = RDTSC();
/* Any looping means another CPU changed min/max. */
do {
old = Atomic_Read64(&tscDelta->min);
} while (old > tsc && !Atomic_CMPXCHG64(&tscDelta->min, &old, &tsc));
do {
old = Atomic_Read64(&tscDelta->max);
} while (old < tsc && !Atomic_CMPXCHG64(&tscDelta->max, &old, &tsc));
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncReadTSCs --
*
* Simultaneously read the TSCs on all cpus.
*
* Results:
* The set of all TSCs.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
__attribute__((always_inline)) static Bool
LinuxDriverSyncReadTSCs(uint64 *delta) // OUT: TSC max - TSC min
{
TSCDelta tscDelta;
unsigned i;
Bool okay = FALSE;
/* Take the global lock to block concurrent calls. */
HostIF_GlobalLock(14);
/* Loop to warm up the cache. */
for (i = 0; i < 3; i++) {
Atomic_Write64(&tscDelta.min, ~CONST64U(0));
Atomic_Write64(&tscDelta.max, CONST64U(0));
if (LinuxDriverSyncCallOnEachCPU(LinuxDriverReadTSC, &tscDelta)) {
/* We return the last successful simultaneous read of the TSCs. */
*delta = Atomic_Read64(&tscDelta.max) - Atomic_Read64(&tscDelta.min);
okay = TRUE;
}
}
HostIF_GlobalUnlock(14);
return okay;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriver_Ioctl --
*
* Main path for UserRPC
*
* Be VERY careful with stack usage; gcc's stack allocation is iffy
* and allocations from individual "case" statements do not overlap,
* so it is easy to use kilobytes of stack space here.
*
* Results:
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
long
LinuxDriver_Ioctl(struct file *filp, // IN:
u_int iocmd, // IN:
unsigned long ioarg) // IN:
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
int retval = 0;
Vcpuid vcpuid;
VMDriver *vm;
if (vmLinux == NULL) {
return -EINVAL;
}
vm = vmLinux->vm;
/*
* Validate the VM pointer for those IOCTLs that require it.
*/
switch (iocmd) {
case IOCTL_VMX86_VERSION:
case IOCTL_VMX86_CREATE_VM:
case IOCTL_VMX86_INIT_CROSSGDT:
case IOCTL_VMX86_SET_UID:
case IOCTL_VMX86_GET_NUM_VMS:
case IOCTL_VMX86_GET_TOTAL_MEM_USAGE:
case IOCTL_VMX86_SET_HARD_LIMIT:
case IOCTL_VMX86_PAE_ENABLED:
case IOCTL_VMX86_VMX_ENABLED:
case IOCTL_VMX86_GET_IPI_VECTORS:
case IOCTL_VMX86_GET_KHZ_ESTIMATE:
case IOCTL_VMX86_GET_ALL_CPUID:
case IOCTL_VMX86_GET_ALL_MSRS:
case IOCTL_VMX86_SET_POLL_TIMEOUT_PTR:
case IOCTL_VMX86_GET_KERNEL_CLOCK_RATE:
case IOCTL_VMX86_GET_REFERENCE_CLOCK_HZ:
case IOCTL_VMX86_INIT_PSEUDO_TSC:
case IOCTL_VMX86_CHECK_PSEUDO_TSC:
case IOCTL_VMX86_GET_PSEUDO_TSC:
case IOCTL_VMX86_SET_HOST_CLOCK_PRIORITY:
case IOCTL_VMX86_SYNC_GET_TSCS:
case IOCTL_VMX86_GET_UNAVAIL_PERF_CTRS:
break;
default:
if (vm == NULL) {
retval = -EINVAL;
goto exit;
}
}
/*
* Perform the IOCTL operation.
*/
switch (iocmd) {
case IOCTL_VMX86_VERSION:
retval = VMMON_VERSION;
break;
case IOCTL_VMX86_CREATE_VM:
if (vm != NULL) {
retval = -EINVAL;
break;
}
vm = Vmx86_CreateVM();
if (vm == NULL) {
retval = -ENOMEM;
} else {
vmLinux->vm = vm;
retval = vm->userID;
}
break;
case IOCTL_VMX86_RELEASE_VM:
vmLinux->vm = NULL;
Vmx86_ReleaseVM(vm);
break;
case IOCTL_VMX86_ALLOC_CROSSGDT: {
InitBlock initBlock;
if (Task_AllocCrossGDT(&initBlock)) {
retval = HostIF_CopyToUser((char *)ioarg, &initBlock,
sizeof initBlock);
} else {
retval = -EINVAL;
}
break;
}
case IOCTL_VMX86_INIT_VM: {
InitBlock initParams;
retval = HostIF_CopyFromUser(&initParams, (char *)ioarg,
sizeof initParams);
if (retval != 0) {
break;
}
if (Vmx86_InitVM(vm, &initParams)) {
retval = -EINVAL;
break;
}
retval = HostIF_CopyToUser((char *)ioarg, &initParams,
sizeof initParams);
break;
}
case IOCTL_VMX86_INIT_CROSSGDT: {
InitCrossGDT initCrossGDT;
retval = HostIF_CopyFromUser(&initCrossGDT, (char *)ioarg,
sizeof initCrossGDT);
if ((retval == 0) && Task_InitCrossGDT(&initCrossGDT)) {
retval = -EIO;
}
break;
}
case IOCTL_VMX86_RUN_VM:
vcpuid = ioarg;
if (vcpuid >= vm->numVCPUs) {
retval = -EINVAL;
break;
}
retval = Vmx86_RunVM(vm, vcpuid);
break;
case IOCTL_VMX86_SET_UID:
#ifdef VMX86_DEVEL
devel_suid();
#else
retval = -EPERM;
#endif
break;
case IOCTL_VMX86_LOCK_PAGE: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LockPage(vm, args.uAddr, FALSE, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_LOCK_PAGE_NEW: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LockPage(vm, args.uAddr, TRUE, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_UNLOCK_PAGE: {
VA64 uAddr;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
retval = Vmx86_UnlockPage(vm, uAddr);
break;
}
case IOCTL_VMX86_UNLOCK_PAGE_BY_MPN: {
VMMUnlockPageByMPN args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
retval = Vmx86_UnlockPageByMPN(vm, args.mpn, args.uAddr);
break;
}
case IOCTL_VMX86_LOOK_UP_MPN: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LookupUserMPN(vm, args.uAddr, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_GET_NUM_VMS:
retval = Vmx86_GetNumVMs();
break;
case IOCTL_VMX86_GET_TOTAL_MEM_USAGE:
retval = Vmx86_GetTotalMemUsage();
break;
case IOCTL_VMX86_SET_HARD_LIMIT: {
int32 limit;
retval = HostIF_CopyFromUser(&limit, (void *)ioarg, sizeof limit);
if (retval != 0) {
break;
}
if (!Vmx86_SetConfiguredLockedPagesLimit(limit)) {
retval = -EINVAL;
}
break;
}
case IOCTL_VMX86_ADMIT: {
VMMemInfoArgs args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval != 0) {
break;
}
Vmx86_Admit(vm, &args);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_READMIT: {
OvhdMem_Deltas delta;
retval = HostIF_CopyFromUser(&delta, (void *)ioarg, sizeof delta);
if (retval != 0) {
break;
}
if (!Vmx86_Readmit(vm, &delta)) {
retval = -1;
}
break;
}
case IOCTL_VMX86_UPDATE_MEM_INFO: {
VMMemMgmtInfoPatch patch;
retval = HostIF_CopyFromUser(&patch, (void *)ioarg, sizeof patch);
if (retval == 0) {
Vmx86_UpdateMemInfo(vm, &patch);
}
break;
}
case IOCTL_VMX86_GET_MEM_INFO: {
VA64 uAddr;
VMMemInfoArgs *userVA;
VMMemInfoArgs in;
VMMemInfoArgs *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
if (in.numVMs < 1 || in.numVMs > MAX_VMS) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(VM_GET_MEM_INFO_SIZE(in.numVMs), TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!Vmx86_GetMemInfo(vm, FALSE, out, VM_GET_MEM_INFO_SIZE(in.numVMs))) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser(userVA, out,
VM_GET_MEM_INFO_SIZE(out->numVMs));
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_PAE_ENABLED:
retval = Vmx86_PAEEnabled();
break;
case IOCTL_VMX86_VMX_ENABLED:
retval = Vmx86_VMXEnabled();
break;
case IOCTL_VMX86_APIC_INIT: {
VMAPICInfo info;
Bool setVMPtr;
Bool probe;
retval = HostIF_CopyFromUser(&info, (VMAPICInfo *)ioarg, sizeof info);
if (retval != 0) {
break;
}
setVMPtr = ((info.flags & APIC_FLAG_DISABLE_NMI) != 0);
probe = ((info.flags & APIC_FLAG_PROBE) != 0);
/*
* Kernel uses NMIs for deadlock detection - set APIC VMptr so that
* NMIs get disabled in the monitor.
*/
setVMPtr = TRUE;
retval = HostIF_APICInit(vm, setVMPtr, probe) ? 0 : -ENODEV;
break;
}
case IOCTL_VMX86_SET_HOST_CLOCK_RATE:
retval = -Vmx86_SetHostClockRate(vm, (unsigned)ioarg);
break;
case IOCTL_VMX86_SEND_IPI: {
VCPUSet ipiTargets;
retval = HostIF_CopyFromUser(&ipiTargets, (VCPUSet *) ioarg,
sizeof ipiTargets);
if (retval == 0) {
HostIF_IPI(vm, &ipiTargets);
}
break;
}
case IOCTL_VMX86_GET_IPI_VECTORS: {
IPIVectors ipiVectors;
ipiVectors.hostIPIVectors[0] = CALL_FUNCTION_VECTOR;
#ifdef CALL_FUNCTION_SINGLE_VECTOR
ipiVectors.hostIPIVectors[1] = CALL_FUNCTION_SINGLE_VECTOR;
#else
ipiVectors.hostIPIVectors[1] = 0;
#endif
ipiVectors.monitorIPIVector = monitorIPIVector;
ipiVectors.hvIPIVector = hvIPIVector;
retval = HostIF_CopyToUser((void *)ioarg, &ipiVectors,
sizeof ipiVectors);
break;
}
case IOCTL_VMX86_GET_KHZ_ESTIMATE:
retval = LinuxDriverEstimateTSCkHz();
break;
case IOCTL_VMX86_GET_ALL_CPUID: {
VA64 uAddr;
CPUIDQuery *userVA;
CPUIDQuery in;
CPUIDQuery *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
/*
* Some kernels panic on kmalloc request larger than 128KB.
* XXX This test should go inside HostIF_AllocKernelMem() then.
*/
if (in.numLogicalCPUs >
(131072 - sizeof *out) / sizeof out->logicalCPUs[0]) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(
sizeof *out + in.numLogicalCPUs * sizeof out->logicalCPUs[0],
TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!HostIF_GetAllCpuInfo(out)) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser((int8 *)userVA + sizeof *userVA,
&out->logicalCPUs[0],
out->numLogicalCPUs * sizeof out->logicalCPUs[0]);
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_GET_ALL_MSRS: {
VA64 uAddr;
MSRQuery *userVA;
MSRQuery in;
MSRQuery *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
/*
* Some kernels panic on kmalloc request larger than 128KB.
* XXX This test should go inside HostIF_AllocKernelMem() then.
*/
if (in.numLogicalCPUs >
(131072 - sizeof *out) / sizeof out->logicalCPUs[0]) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(
sizeof *out + in.numLogicalCPUs * sizeof out->logicalCPUs[0],
TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!Vmx86_GetAllMSRs(out)) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser((int8 *)userVA + sizeof *userVA,
&out->logicalCPUs[0],
out->numLogicalCPUs * sizeof out->logicalCPUs[0]);
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_ALLOC_LOCKED_PAGES:
case IOCTL_VMX86_FREE_LOCKED_PAGES: {
VMMPNList req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
if (iocmd == IOCTL_VMX86_ALLOC_LOCKED_PAGES) {
retval = Vmx86_AllocLockedPages(vm, req.mpnList,
req.mpnCount, FALSE,
req.ignoreLimits);
} else {
retval = Vmx86_FreeLockedPages(vm, req.mpnList,
req.mpnCount, FALSE);
}
break;
}
case IOCTL_VMX86_GET_NEXT_ANON_PAGE: {
VMMPNNext req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
req.outMPN = INVALID_MPN;
} else {
req.outMPN = Vmx86_GetNextAnonPage(vm, req.inMPN);
}
retval = HostIF_CopyToUser((void *)ioarg, &req, sizeof req);
break;
}
case IOCTL_VMX86_GET_LOCKED_PAGES_LIST: {
VMMPNList req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = Vmx86_GetLockedPageList(vm, req.mpnList, req.mpnCount);
break;
}
case IOCTL_VMX86_READ_PAGE: {
VMMReadWritePage req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = HostIF_ReadPage(vm, req.mpn, req.uAddr, FALSE);
break;
}
case IOCTL_VMX86_WRITE_PAGE: {
VMMReadWritePage req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = HostIF_WritePage(vm, req.mpn, req.uAddr, FALSE);
break;
}
case IOCTL_VMX86_SET_POLL_TIMEOUT_PTR: {
vmLinux->pollTimeoutPtr = NULL;
HostIF_UnmapUserMem(vmLinux->pollTimeoutHandle);
if (ioarg != 0) {
vmLinux->pollTimeoutPtr = HostIF_MapUserMem((VA)ioarg,
sizeof *vmLinux->pollTimeoutPtr,
&vmLinux->pollTimeoutHandle);
if (vmLinux->pollTimeoutPtr == NULL) {
retval = -EINVAL;
break;
}
}
break;
}
case IOCTL_VMX86_GET_KERNEL_CLOCK_RATE:
retval = HZ;
break;
case IOCTL_VMX86_FAST_SUSP_RES_SET_OTHER_FLAG:
retval = Vmx86_FastSuspResSetOtherFlag(vm, ioarg);
break;
case IOCTL_VMX86_FAST_SUSP_RES_GET_MY_FLAG:
retval = Vmx86_FastSuspResGetMyFlag(vm, ioarg);
break;
case IOCTL_VMX86_GET_REFERENCE_CLOCK_HZ: {
uint64 refClockHz = HostIF_UptimeFrequency();
retval = HostIF_CopyToUser((void *)ioarg, &refClockHz,
sizeof refClockHz);
break;
}
case IOCTL_VMX86_INIT_PSEUDO_TSC: {
PTSCInitParams params;
retval = HostIF_CopyFromUser(&params, (void *)ioarg, sizeof params);
if (retval != 0) {
break;
}
Vmx86_InitPseudoTSC(&params);
retval = HostIF_CopyToUser((void *)ioarg, &params, sizeof params);
break;
}
case IOCTL_VMX86_CHECK_PSEUDO_TSC: {
PTSCCheckParams params;
retval = HostIF_CopyFromUser(&params, (void *)ioarg, sizeof params);
if (retval != 0) {
break;
}
params.usingRefClock = Vmx86_CheckPseudoTSC(&params.lastTSC,
&params.lastRC);
retval = HostIF_CopyToUser((void *)ioarg, &params, sizeof params);
break;
}
case IOCTL_VMX86_GET_PSEUDO_TSC: {
uint64 ptsc = Vmx86_GetPseudoTSC();
retval = HostIF_CopyToUser((void *)ioarg, &ptsc, sizeof ptsc);
break;
}
case IOCTL_VMX86_SET_HOST_CLOCK_PRIORITY:
/*
* This affects the global fast clock priority, and it only
* takes effect when the fast clock rate transitions from zero
* to a non-zero value.
*
* This is used to allow VMs to optionally work around
* bug 218750 by disabling our default priority boost. If any
* VM chooses to apply this workaround, the effect is permanent
* until vmmon is reloaded!
*/
HostIF_FastClockLock(3);
linuxState.fastClockPriority = MAX(-20, MIN(19, (int)ioarg));
HostIF_FastClockUnlock(3);
retval = 0;
break;
case IOCTL_VMX86_SYNC_GET_TSCS: {
uint64 delta;
if (LinuxDriverSyncReadTSCs(&delta)) {
retval = HostIF_CopyToUser((void *)ioarg, &delta, sizeof delta);
} else {
retval = -EBUSY;
}
break;
}
case IOCTL_VMX86_SET_HOST_SWAP_SIZE: {
uint64 swapSize;
retval = HostIF_CopyFromUser(&swapSize, (void *)ioarg, sizeof swapSize);
if (retval != 0) {
Warning("Could not copy swap size from user, status %d\n", retval);
break;
}
linuxState.swapSize = swapSize;
break;
}
case IOCTL_VMX86_GET_UNAVAIL_PERF_CTRS: {
uint64 ctrs = Vmx86_GetUnavailablePerfCtrs();
retval = HostIF_CopyToUser((void *)ioarg, &ctrs, sizeof ctrs);
break;
}
default:
Warning("Unknown ioctl %d\n", iocmd);
retval = -EINVAL;
}
exit:
return retval;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverQueue --
*
* add the vmLinux to the global queue
*
* Results:
*
* void
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverQueue(VMLinux *vmLinux) // IN/OUT:
{
/*
* insert in global vm queue
*/
HostIF_GlobalLock(12);
vmLinux->next = linuxState.head;
linuxState.head = vmLinux;
HostIF_GlobalUnlock(12);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverDequeue --
*
* remove from active list
*
* Results:
*
* void
* Side effects:
* printk if it is not in the list (error condition)
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverDequeue(VMLinux *vmLinux) // IN/OUT:
{
VMLinux **p;
HostIF_GlobalLock(13);
for (p = &linuxState.head; *p != vmLinux; p = &(*p)->next) {
ASSERT(*p != NULL);
}
*p = vmLinux->next;
vmLinux->next = NULL;
HostIF_GlobalUnlock(13);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverCheckPadding --
*
* check for expected padding --
* this check currently fails on the egcs compiler
*
* Results:
*
* TRUE if the check succeeds -- module will be loaded
*
*
*
* Side effects:
* output to kernel log on error
*
*----------------------------------------------------------------------
*/
static Bool
LinuxDriverCheckPadding(void)
{
DTRWords32 dtr;
uint16 *x;
memset(&dtr, 0, sizeof dtr);
dtr.dtr.limit = 0x1111;
dtr.dtr.offset = 0x22223333;
x = (uint16 *) &dtr;
if (x[0] == 0x1111 && x[1] == 0x3333 && x[2] == 0x2222) {
} else {
Warning("DTR padding\n");
goto error;
}
return TRUE;
error:
printk("/dev/vmmon: Cannot load module. Use standard gcc compiler\n");
return FALSE;
}
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Machine Monitor.");
MODULE_LICENSE("GPL v2");
/*
* Starting with SLE10sp2, Novell requires that IHVs sign a support agreement
* with them and mark their kernel modules as externally supported via a
* change to the module header. If this isn't done, the module will not load
* by default (i.e., neither mkinitrd nor modprobe will accept it).
*/
MODULE_INFO(supported, "external");
Modify the file /usr/lib/vmware/modules/source/vmmon-only/linux/hostif.c so that it reads as follows:
/*********************************************************
* Copyright (C) 1998-2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/*
* hostif.c --
*
* This file implements the platform-specific (here Linux) interface that
* the cross-platform code uses --hpreg
*
*/
/* Must come before any kernel header file --hpreg */
#include "driver-config.h"
/* Must come before vmware.h --hpreg */
#include "compat_timer.h"
#include <linux/binfmts.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/poll.h>
#include <linux/mman.h>
#include <linux/smp.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
# include <asm/asm.h>
#endif
#if defined(_ASM_EXTABLE)
# define VMW_ASM_EXTABLE(from, to) _ASM_EXTABLE(from, to)
#else
/* Compat version copied from asm.h of 2.6.25 kernel */
# define VMW_ASM_FORM(x) " " #x " "
# define VMW_ASM_EX_SEC " .section __ex_table,\"a\"\n"
# ifdef CONFIG_X86_32
# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(a)
# else
# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(b)
# endif
# define VMW_ASM_PTR VMW_ASM_SEL(.long, .quad)
# define VMW_ASM_ALIGN VMW_ASM_SEL(.balign 4, .balign 8)
# define VMW_ASM_EXTABLE(from,to) \
VMW_ASM_EX_SEC \
VMW_ASM_ALIGN "\n" \
VMW_ASM_PTR #from "," #to "\n" \
" .previous\n"
#endif
#include <asm/io.h>
#include <asm/uaccess.h>
#include <linux/mc146818rtc.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/signal.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#include <linux/taskstats_kern.h> // For linux/sched/signal.h without version check
#endif
#include "vmware.h"
#include "x86apic.h"
#include "vm_asm.h"
#include "modulecall.h"
#include "driver.h"
#include "memtrack.h"
#include "phystrack.h"
#include "cpuid.h"
#include "cpuid_info.h"
#include "hostif.h"
#include "hostif_priv.h"
#include "vmhost.h"
#include "x86msr.h"
#include "apic.h"
#include "memDefaults.h"
#include "vcpuid.h"
#include "pgtbl.h"
#include "vmmonInt.h"
#include "versioned_atomic.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
# define global_zone_page_state global_page_state
#endif
static unsigned long get_nr_slab_unreclaimable(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
return global_node_page_state(NR_SLAB_UNRECLAIMABLE);
#else
return global_page_state(NR_SLAB_UNRECLAIMABLE);
#endif
}
static unsigned long get_nr_unevictable(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
return global_node_page_state(NR_UNEVICTABLE);
#else
return global_page_state(NR_UNEVICTABLE);
#endif
}
static unsigned long get_nr_anon_mapped(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
return global_node_page_state(NR_ANON_MAPPED);
#else
return global_page_state(NR_ANON_PAGES);
#endif
}
/*
* Determine if we can use high resolution timers.
*/
#ifdef CONFIG_HIGH_RES_TIMERS
# include <linux/hrtimer.h>
# define VMMON_USE_HIGH_RES_TIMERS
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
# define VMMON_USE_SCHEDULE_HRTIMEOUT
# else
# define VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
static void HostIFWakeupClockThread(unsigned long data);
static DECLARE_TASKLET(timerTasklet, HostIFWakeupClockThread, 0);
# endif
# define close_rtc(filp, files) do {} while(0)
#else
# define close_rtc(filp, files) filp_close(filp, files)
#endif
#define UPTIME_FREQ CONST64(1000000)
/*
* When CONFIG_NO_HZ_FULL is set processors can run tickless
* if there is only one runnable process. When set, the rate
* checks in HostIF_SetFastClockRate and HostIFFastClockThread
* need to be relaxed to allow any non-zero rate to run.
*
* This code can potentially be removed if/when we stop using
* HostIFFastClockThread to drive MonTimer. See PR1088247.
*/
#ifdef CONFIG_NO_HZ_FULL
#define MIN_RATE (0)
#else
#define MIN_RATE ((HZ) + (HZ) / 16)
#endif
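/* Example: with HZ == 1000 and CONFIG_NO_HZ_FULL unset, MIN_RATE is 1000 + 1000/16 = 1062. */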
/*
* Linux seems to like keeping free memory around 30MB
* even under severe memory pressure. Let's give it a little
* more leeway than that for safety.
*/
#define LOCKED_PAGE_SLACK 10000
static struct {
Atomic_uint64 uptimeBase;
VersionedAtomic version;
uint64 monotimeBase;
unsigned long jiffiesBase;
struct timer_list timer;
} uptimeState;
/*
* First Page Locking strategy
* ---------------------------
*
* An early implementation hacked the lock bit for the purpose of locking
* memory. This had a couple of advantages:
* - the vmscan algorithm would never eliminate mappings from the process
* address space
* - easy to assert that things are ok
* - it worked with anonymous memory. Basically, vmscan jumps over these
* pages, their use count stays high, ....
*
* This approach however had a couple of problems:
*
* - it relies on an undocumented interface. (in other words, a total hack)
* - it creates deadlock situations if the application gets a kill -9 or
* otherwise dies ungracefully. linux first tears down the address space,
* then closes file descriptors (including our own device). Unfortunately,
* this leads to a deadlock of the process on pages with the lock bit set.
*
* There is a workaround for that, namely to detect that condition using
* a linux timer. (ugly)
*
* Current Page Locking strategy
* -----------------------------
*
* The current scheme does not use the lock bit, rather it increments the use
* count on the pages that need to be locked down in memory.
*
* The problem is that experiments on certain linux systems (e.g. 2.2.0-pre9)
* showed that linux somehow swaps out anonymous pages, even with the
* increased ref counter.
* Swapping them out to disk is not that big of a deal, but bringing them back
* to a different location is. In any case, anonymous pages in linux are not
* intended to be write-shared (e.g. try to MAP_SHARED /dev/zero).
*
* As a result, the current locking strategy requires that all locked pages are
* backed by the filesystem, not by swap. For now, we use both mapped files and
* sys V shared memory. The user application is responsible for covering these
* cases.
*
*/
#define HOST_UNLOCK_PFN(_vm, _pfn) do { \
_vm = _vm; \
put_page(pfn_to_page(_pfn)); \
} while (0)
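/* The '_vm = _vm' self-assignment above merely references the otherwise-unused argument to avoid compiler warnings. */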
#define HOST_UNLOCK_PFN_BYMPN(_vm, _pfn) do { \
PhysTrack_Remove((_vm)->vmhost->lockedPages, (_pfn)); \
put_page(pfn_to_page(_pfn)); \
} while (0)
uint8 monitorIPIVector;
uint8 hvIPIVector;
/*
*-----------------------------------------------------------------------------
*
* MutexInit --
*
* Initialize a Mutex. --hpreg
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
#ifdef VMX86_DEBUG
static INLINE void
MutexInit(Mutex *mutex, // IN
char const *name) // IN
{
ASSERT(mutex);
ASSERT(name);
sema_init(&mutex->sem, 1);
mutex->name = name;
mutex->cur.pid = -1;
}
#else
# define MutexInit(_mutex, _name) sema_init(&(_mutex)->sem, 1)
#endif
#ifdef VMX86_DEBUG
/*
*-----------------------------------------------------------------------------
*
* MutexIsLocked --
*
* Determine if a Mutex is locked by the current thread. --hpreg
*
* Results:
* TRUE if yes
* FALSE if no
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static INLINE Bool
MutexIsLocked(Mutex *mutex) // IN
{
ASSERT(mutex);
return mutex->cur.pid == current->pid;
}
#endif
/*
*-----------------------------------------------------------------------------
*
* MutexLock --
*
* Acquire a Mutex. --hpreg
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
#ifdef VMX86_DEBUG
static INLINE void
MutexLock(Mutex *mutex, // IN
int callerID) // IN
{
ASSERT(mutex);
ASSERT(!MutexIsLocked(mutex));
down(&mutex->sem);
mutex->cur.pid = current->pid;
mutex->cur.callerID = callerID;
}
#else
# define MutexLock(_mutex, _callerID) down(&(_mutex)->sem)
#endif
/*
*-----------------------------------------------------------------------------
*
* MutexUnlock --
*
* Release a Mutex. --hpreg
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
#ifdef VMX86_DEBUG
static INLINE void
MutexUnlock(Mutex *mutex, // IN
int callerID) // IN
{
ASSERT(mutex);
ASSERT(MutexIsLocked(mutex) && mutex->cur.callerID == callerID);
mutex->prev = mutex->cur;
mutex->cur.pid = -1;
up(&mutex->sem);
}
#else
# define MutexUnlock(_mutex, _callerID) up(&(_mutex)->sem)
#endif
/* This mutex protects the driver-wide state. --hpreg */
static Mutex globalMutex;
/*
* This mutex protects the fast clock rate and is held while
* creating/destroying the fastClockThread. It ranks below
* globalMutex. We can't use globalMutex for this purpose because the
* fastClockThread itself acquires the globalMutex, so trying to hold
* the mutex while destroying the thread can cause a deadlock.
*/
static Mutex fastClockMutex;
/* This mutex protects linuxState.pollList. */
static Mutex pollListMutex;
/*
*----------------------------------------------------------------------
*
* HostIF_PrepareWaitForThreads --
*
* Prepare to wait for another vCPU thread.
*
* Results:
* FALSE: no way on Linux to determine we've already been signalled.
*
* Side effects:
* Current task is interruptible.
*
*----------------------------------------------------------------------
*/
Bool
HostIF_PrepareWaitForThreads(VMDriver *vm, // IN:
Vcpuid currVcpu) // IN:
{
set_current_state(TASK_INTERRUPTIBLE);
vm->vmhost->vcpuSemaTask[currVcpu] = current;
return FALSE;
}
/*
*----------------------------------------------------------------------
*
* HostIF_WaitForThreads --
*
* Wait for another vCPU thread.
*
* Results:
* None.
*
* Side effects:
* Current task may block.
*
*----------------------------------------------------------------------
*/
void
HostIF_WaitForThreads(VMDriver *vm, // UNUSED:
Vcpuid currVcpu) // UNUSED:
{
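/*
* Note: when schedule_hrtimeout() is available the sleep below has
* sub-jiffy resolution; the fallback rounds CROSSCALL_SLEEP_US up to whole
* jiffies, which is coarser but harmless on older kernels.
*/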
#ifdef VMMON_USE_SCHEDULE_HRTIMEOUT
ktime_t timeout = ktime_set(0, CROSSCALL_SLEEP_US * 1000);
schedule_hrtimeout(&timeout, HRTIMER_MODE_REL);
#else
/* Fallback to ms timer resolution is fine for older kernels. */
schedule_timeout(msecs_to_jiffies(CROSSCALL_SLEEP_US / 1000) + 1);
#endif
}
/*
*----------------------------------------------------------------------
*
* HostIF_CancelWaitForThreads --
*
* Cancel waiting for another vCPU thread.
*
* Results:
* None.
*
* Side effects:
* Current task is running and no longer interruptible.
*
*----------------------------------------------------------------------
*/
void
HostIF_CancelWaitForThreads(VMDriver *vm, // IN:
Vcpuid currVcpu) // IN:
{
vm->vmhost->vcpuSemaTask[currVcpu] = NULL;
set_current_state(TASK_RUNNING);
}
/*
*----------------------------------------------------------------------
*
* HostIF_WakeUpYielders --
*
* Wakeup vCPUs that are waiting for the current vCPU.
*
* Results:
* The requested vCPUs are nudged if they are sleeping due to
* Vmx86_YieldToSet.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
HostIF_WakeUpYielders(VMDriver *vm, // IN:
Vcpuid currVcpu) // IN:
{
VCPUSet req;
Vcpuid vcpuid;
uint64 subset;
/*
* PR 1142958: if the VCPUs woken in the crosscallWaitSet re-add themselves
* to this set faster than it can be fully drained, this function never
* exits. Instead, we copy and remove a snapshot of the crosscallWaitSet
* and locally wake up just that snapshot. It is ok that we don't get a
* fully coherent snapshot, as long as the subset copy-and-remove is atomic
* so no VCPU added is lost entirely.
*/
VCPUSet_Empty(&req);
FOR_EACH_SUBSET_IN_SET(subIdx) {
subset = VCPUSet_AtomicReadWriteSubset(&vm->crosscallWaitSet[currVcpu],
0, subIdx);
VCPUSet_UnionSubset(&req, subset, subIdx);
} ROF_EACH_SUBSET_IN_SET();
preempt_disable();
while ((vcpuid = VCPUSet_FindFirst(&req)) != VCPUID_INVALID) {
struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
VCPUSet_Remove(&req, vcpuid);
if (t && (t->state & TASK_INTERRUPTIBLE)) {
wake_up_process(t);
}
}
preempt_enable();
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_InitGlobalLock --
*
* Initialize the global (across all VMs and vmmon) locks.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_InitGlobalLock(void)
{
MutexInit(&globalMutex, "global");
MutexInit(&fastClockMutex, "fastClock");
MutexInit(&pollListMutex, "pollList");
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GlobalLock --
*
* Grabs the global data structure lock.
*
* Results:
* None
*
* Side effects:
* Should be a very low contention lock.
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_GlobalLock(int callerID) // IN
{
MutexLock(&globalMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GlobalUnlock --
*
* Releases the global data structure lock.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_GlobalUnlock(int callerID) // IN
{
MutexUnlock(&globalMutex, callerID);
}
#ifdef VMX86_DEBUG
/*
*-----------------------------------------------------------------------------
*
* HostIF_GlobalLockIsHeld --
*
* Determine if the global lock is held by the current thread.
*
* Results:
* TRUE if yes
* FALSE if no
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIF_GlobalLockIsHeld(void)
{
return MutexIsLocked(&globalMutex);
}
#endif
/*
*-----------------------------------------------------------------------------
*
* HostIF_FastClockLock --
*
* Grabs the fast clock data structure lock.
*
* Results:
* None
*
* Side effects:
* Should be a very low contention lock.
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FastClockLock(int callerID) // IN
{
MutexLock(&fastClockMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_FastClockUnlock --
*
* Releases the fast clock data structure lock.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FastClockUnlock(int callerID) // IN
{
MutexUnlock(&fastClockMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_PollListLock --
*
* Grabs the linuxState.pollList lock.
*
* Results:
* None
*
* Side effects:
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_PollListLock(int callerID) // IN
{
MutexLock(&pollListMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_PollListUnlock --
*
* Releases the linuxState.pollList lock.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_PollListUnlock(int callerID) // IN
{
MutexUnlock(&pollListMutex, callerID);
}
/*
*----------------------------------------------------------------------
*
* MapCrossPage & UnmapCrossPage
*
* Both x86-64 and ia32 need to map crosspage to an executable
* virtual address. We use the vmap interface instead of kmap
* due to bug 43907.
*
* Side effects:
*
* UnmapCrossPage assumes that the page has been refcounted up,
* so it takes care of the put_page.
*
*----------------------------------------------------------------------
*/
static void *
MapCrossPage(struct page *p) // IN:
{
return vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);
}
static void
UnmapCrossPage(struct page *p, // IN:
void *va) // IN:
{
vunmap(va);
put_page(p);
}
/*
*----------------------------------------------------------------------
*
* HostIFHostMemInit --
*
* Initialize per-VM pages lists.
*
* Results:
* 0 on success,
* non-zero on failure.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static int
HostIFHostMemInit(VMDriver *vm) // IN:
{
VMHost *vmh = vm->vmhost;
vmh->lockedPages = PhysTrack_Alloc(vm);
if (!vmh->lockedPages) {
return -1;
}
vmh->AWEPages = PhysTrack_Alloc(vm);
if (!vmh->AWEPages) {
return -1;
}
return 0;
}
/*
*----------------------------------------------------------------------
*
* HostIFHostMemCleanup --
*
* Release per-VM pages lists.
*
* Results:
* None.
*
* Side effects:
* Locked and AWE pages are released.
*
*----------------------------------------------------------------------
*/
static void
HostIFHostMemCleanup(VMDriver *vm) // IN:
{
MPN mpn;
VMHost *vmh = vm->vmhost;
if (!vmh) {
return;
}
HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock.
if (vmh->lockedPages) {
for (mpn = 0;
INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->lockedPages, mpn));) {
HOST_UNLOCK_PFN_BYMPN(vm, mpn);
}
PhysTrack_Free(vmh->lockedPages);
vmh->lockedPages = NULL;
}
if (vmh->AWEPages) {
for (mpn = 0;
INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->AWEPages, mpn));) {
PhysTrack_Remove(vmh->AWEPages, mpn);
put_page(pfn_to_page(mpn));
}
PhysTrack_Free(vmh->AWEPages);
vmh->AWEPages = NULL;
}
HostIF_VMUnlock(vm, 32);
}
/*
*----------------------------------------------------------------------
*
* HostIF_AllocMachinePage --
*
* Alloc non-swappable memory page. The page is not billed to
* a particular VM. Preferably the page should not be mapped into
* the kernel address space.
*
* Results:
* INVALID_MPN or a valid host mpn.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
MPN
HostIF_AllocMachinePage(void)
{
struct page *pg = alloc_page(GFP_HIGHUSER);
return (pg) ? ((MPN)page_to_pfn(pg)) : INVALID_MPN;
}
/*
*----------------------------------------------------------------------
*
* HostIF_FreeMachinePage --
*
* Free an anonymous machine page allocated by
* HostIF_AllocMachinePage(). This page is not tracked in any
* phystracker.
*
* Results:
* Host page is unlocked.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
HostIF_FreeMachinePage(MPN mpn) // IN:
{
struct page *pg = pfn_to_page(mpn);
__free_page(pg);
}
/*
*----------------------------------------------------------------------
*
* HostIF_AllocLockedPages --
*
* Alloc non-swappable memory.
*
* Results:
* negative value on complete failure
* non-negative value on partial/full completion, number of MPNs
* allocated & filled in pmpn returned.
*
* Side effects:
* Pages allocated.
*
*----------------------------------------------------------------------
*/
int
HostIF_AllocLockedPages(VMDriver *vm, // IN: VM instance pointer
VA64 addr, // OUT: pointer to user or kernel buffer for MPNs
unsigned numPages, // IN: number of pages to allocate
Bool kernelMPNBuffer)// IN: is the MPN buffer in kernel or user address space?
{
MPN *pmpn = VA64ToPtr(addr);
VMHost *vmh = vm->vmhost;
unsigned int cnt;
int err = 0;
if (!vmh || !vmh->AWEPages) {
return -EINVAL;
}
for (cnt = 0; cnt < numPages; cnt++) {
struct page* pg;
MPN mpn;
pg = alloc_page(GFP_HIGHUSER);
if (!pg) {
err = -ENOMEM;
break;
}
mpn = (MPN)page_to_pfn(pg);
if (kernelMPNBuffer) {
*pmpn = mpn;
} else if (HostIF_CopyToUser(pmpn, &mpn, sizeof *pmpn) != 0) {
__free_page(pg);
err = -EFAULT;
break;
}
pmpn++;
if (PhysTrack_Test(vmh->AWEPages, mpn)) {
Warning("%s: duplicate MPN %016" FMT64 "x\n", __func__, mpn);
}
PhysTrack_Add(vmh->AWEPages, mpn);
}
return cnt ? cnt : err;
}
/*
*----------------------------------------------------------------------
*
* HostIF_FreeLockedPages --
*
* Free non-swappable memory.
*
* Results:
* On success: 0. All pages were unlocked.
* On failure: Non-zero system error code. No page was unlocked.
*
* Side effects:
* Pages freed.
*
*----------------------------------------------------------------------
*/
int
HostIF_FreeLockedPages(VMDriver *vm, // IN: VM instance pointer
VA64 addr, // IN: user or kernel array of MPNs
unsigned numPages, // IN: number of pages to free
Bool kernelMPNBuffer) // IN: is the MPN buffer in kernel or user address space?
{
const int MPN_BATCH = 64;
MPN const *pmpn = VA64ToPtr(addr);
VMHost *vmh = vm->vmhost;
unsigned int cnt;
struct page *pg;
MPN *mpns;
mpns = HostIF_AllocKernelMem(sizeof *mpns * MPN_BATCH, TRUE);
if (mpns == NULL) {
return -ENOMEM;
}
if (!vmh || !vmh->AWEPages) {
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
if (!kernelMPNBuffer) {
if (numPages > MPN_BATCH) {
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
if (HostIF_CopyFromUser(mpns, pmpn, numPages * sizeof *pmpn)) {
printk(KERN_DEBUG "Cannot read from process address space at %p\n",
pmpn);
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
pmpn = mpns;
}
for (cnt = 0; cnt < numPages; cnt++) {
if (!PhysTrack_Test(vmh->AWEPages, pmpn[cnt])) {
printk(KERN_DEBUG "Attempted to free unallocated MPN %016" FMT64 "X\n",
pmpn[cnt]);
HostIF_FreeKernelMem(mpns);
return -EINVAL;
}
pg = pfn_to_page(pmpn[cnt]);
if (page_count(pg) != 1) {
// should this case be considered a failure?
printk(KERN_DEBUG "Page %016" FMT64 "X is still used by someone "
"(use count %u, VM %p)\n", pmpn[cnt],
page_count(pg), vm);
}
}
for (cnt = 0; cnt < numPages; cnt++) {
pg = pfn_to_page(pmpn[cnt]);
PhysTrack_Remove(vmh->AWEPages, pmpn[cnt]);
__free_page(pg);
}
HostIF_FreeKernelMem(mpns);
return 0;
}
/*
*----------------------------------------------------------------------
*
* HostIF_Init --
*
* Initialize the host-dependent part of the driver.
*
* Results:
* zero on success, non-zero on error.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
int
HostIF_Init(VMDriver *vm) // IN:
{
vm->memtracker = MemTrack_Init(vm);
if (vm->memtracker == NULL) {
return -1;
}
vm->vmhost = (VMHost *) HostIF_AllocKernelMem(sizeof *vm->vmhost, TRUE);
if (vm->vmhost == NULL) {
return -1;
}
memset(vm->vmhost, 0, sizeof *vm->vmhost);
if (HostIFHostMemInit(vm)) {
return -1;
}
MutexInit(&vm->vmhost->vmMutex, "vm");
return 0;
}
/*
*------------------------------------------------------------------------------
*
* HostIF_LookupUserMPN --
*
* Lookup the MPN of a locked user page by user VA.
*
* Results:
* A status code and the MPN on success.
*
* Side effects:
* None
*
*------------------------------------------------------------------------------
*/
int
HostIF_LookupUserMPN(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: user VA of the page
MPN *mpn) // OUT
{
void *uvAddr = VA64ToPtr(uAddr);
int retval = PAGE_LOCK_SUCCESS;
*mpn = PgtblVa2MPN((VA)uvAddr);
/*
* On failure, check whether the page is locked.
*
* While we don't require the page to be locked by HostIF_LockPage(),
* it does provide extra information.
*
* -- edward
*/
if (*mpn == INVALID_MPN) {
if (vm == NULL) {
retval += PAGE_LOOKUP_NO_VM;
} else {
MemTrackEntry *entryPtr =
MemTrack_LookupVPN(vm->memtracker, PTR_2_VPN(uvAddr));
if (entryPtr == NULL) {
retval += PAGE_LOOKUP_NOT_TRACKED;
} else if (entryPtr->mpn == 0) {
retval += PAGE_LOOKUP_NO_MPN;
} else {
/*
* Kernel can remove PTEs/PDEs from our pagetables even if pages
* are locked...
*/
volatile int c;
get_user(c, (char *)uvAddr);
*mpn = PgtblVa2MPN((VA)uvAddr);
if (*mpn == entryPtr->mpn) {
#ifdef VMX86_DEBUG
printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
"now back at %016" FMT64 "x\n",
uvAddr, current->comm, current->pid, *mpn);
#endif
} else if (*mpn != INVALID_MPN) {
printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
"now back at %016" FMT64"x (old=%016" FMT64 "x)\n",
uvAddr, current->comm, current->pid, *mpn,
entryPtr->mpn);
*mpn = INVALID_MPN;
} else {
printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
"and is lost (old=%016" FMT64 "x)\n", uvAddr, current->comm,
current->pid, entryPtr->mpn);
*mpn = entryPtr->mpn;
}
}
}
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* HostIF_InitFP --
*
* Masks IRQ13 if it is not already masked.
*
* Results:
* prevents INTR #0x2d (IRQ 13) from being generated --
* assume that Int16 works for interrupt reporting
*
*
* Side effects:
* PIC
*
*----------------------------------------------------------------------
*/
void
HostIF_InitFP(VMDriver *vm) // IN:
{
int mask = (1 << (0xD - 0x8));
uint8 val = inb(0xA1);
if (!(val & mask)) {
val = val | mask;
outb(val, 0xA1);
}
}
/*
*-----------------------------------------------------------------------------
*
* HostIFGetUserPages --
*
* Lock the pages of a user-level address space in memory.
* If ppages is NULL, pages are only marked as dirty.
*
* Results:
* Zero on success, non-zero on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
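/*
* Note on the #if ladder below: get_user_pages() changed prototype twice.
* Before 4.6 it took the task and mm explicitly; 4.6 dropped those two
* arguments; 4.9 merged the write/force flags into a single gup_flags
* argument. Passing 0 keeps the historical read-only, no-force behavior in
* all three variants.
*/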
static int
HostIFGetUserPages(void *uvAddr, // IN
struct page **ppages, // OUT
unsigned int numPages) // IN
{
int retval;
down_read(&current->mm->mmap_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
retval = get_user_pages((unsigned long)uvAddr, numPages, 0, ppages, NULL);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
retval = get_user_pages((unsigned long)uvAddr, numPages, 0, 0, ppages, NULL);
#else
retval = get_user_pages(current, current->mm, (unsigned long)uvAddr,
numPages, 0, 0, ppages, NULL);
#endif
up_read(&current->mm->mmap_sem);
return retval != numPages;
}
/*
*----------------------------------------------------------------------
*
* HostIF_IsLockedByMPN --
*
* Checks if mpn was locked using allowMultipleMPNsPerVA.
*
* Results:
* TRUE if mpn is present in the physTracker.
*
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
HostIF_IsLockedByMPN(VMDriver *vm, // IN:
MPN mpn) // IN:
{
return PhysTrack_Test(vm->vmhost->lockedPages, mpn);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_LockPage --
*
* Look up the MPN of a pinned user-level address space
*
* Results:
* A PAGE_LOCK_* status code and the MPN on success.
*
* Side effects:
* Adds the page to the MemTracker; if allowMultipleMPNsPerVA is set, the page
* is added to the VM's PhysTracker instead.
*
*-----------------------------------------------------------------------------
*/
int
HostIF_LockPage(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: user VA of the page
Bool allowMultipleMPNsPerVA, // IN: allow to lock many pages per VA
MPN *mpn) // OUT: pinned page
{
void *uvAddr = VA64ToPtr(uAddr);
struct page *page;
VPN vpn;
MemTrackEntry *entryPtr = NULL;
vpn = PTR_2_VPN(uvAddr);
if (!allowMultipleMPNsPerVA) {
entryPtr = MemTrack_LookupVPN(vm->memtracker, vpn);
/*
* Already tracked and locked
*/
if (entryPtr != NULL && entryPtr->mpn != 0) {
return PAGE_LOCK_ALREADY_LOCKED;
}
}
if (HostIFGetUserPages(uvAddr, &page, 1)) {
return PAGE_LOCK_FAILED;
}
*mpn = (MPN)page_to_pfn(page);
if (allowMultipleMPNsPerVA) {
/*
* Add the MPN to the PhysTracker that tracks locked pages.
*/
struct PhysTracker* const pt = vm->vmhost->lockedPages;
if (PhysTrack_Test(pt, *mpn)) {
put_page(page);
return PAGE_LOCK_ALREADY_LOCKED;
}
PhysTrack_Add(pt, *mpn);
} else {
/*
* If the entry doesn't exist, add it to the memtracker
* otherwise we just update the mpn.
*/
if (entryPtr == NULL) {
entryPtr = MemTrack_Add(vm->memtracker, vpn, *mpn);
if (entryPtr == NULL) {
HOST_UNLOCK_PFN(vm, *mpn);
return PAGE_LOCK_MEMTRACKER_ERROR;
}
} else {
entryPtr->mpn = *mpn;
}
}
return PAGE_LOCK_SUCCESS;
}
/*
*----------------------------------------------------------------------
*
* HostIF_UnlockPage --
*
* Unlock a pinned user-level page.
*
* Results:
* Status PAGE_UNLOCK_* code.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
int
HostIF_UnlockPage(VMDriver *vm, // IN:
VA64 uAddr) // IN:
{
void *addr = VA64ToPtr(uAddr);
VPN vpn;
MemTrackEntry *e;
vpn = VA_2_VPN((VA)addr);
e = MemTrack_LookupVPN(vm->memtracker, vpn);
if (e == NULL) {
return PAGE_UNLOCK_NOT_TRACKED;
}
if (e->mpn == 0) {
return PAGE_UNLOCK_NO_MPN;
}
HOST_UNLOCK_PFN(vm, e->mpn);
e->mpn = 0;
return PAGE_UNLOCK_SUCCESS;
}
/*
*----------------------------------------------------------------------
*
* HostIF_UnlockPageByMPN --
*
* Unlock a locked user mode page. The page doesn't need to be mapped
* anywhere.
*
* Results:
* Status code. Returns a PAGE_LOOKUP_* error if the page can't be found or
* a PAGE_UNLOCK_* error if the page can't be unlocked.
*
* Side effects:
* Removes the MPN from the VM's PhysTracker.
*
*----------------------------------------------------------------------
*/
int
HostIF_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver
MPN mpn, // IN: the MPN to unlock
VA64 uAddr) // IN: optional(debugging) VA for the MPN
{
if (!PhysTrack_Test(vm->vmhost->lockedPages, mpn)) {
return PAGE_UNLOCK_NO_MPN;
}
#ifdef VMX86_DEBUG
{
void *va = VA64ToPtr(uAddr);
MemTrackEntry *e;
/*
* Verify for debugging that VA and MPN make sense.
* PgtblVa2MPN() can fail under high memory pressure.
*/
if (va != NULL) {
MPN lookupMpn = PgtblVa2MPN((VA)va);
if (lookupMpn != INVALID_MPN && mpn != lookupMpn) {
Warning("Page lookup fail %#"FMT64"x %016" FMT64 "x %p\n",
mpn, lookupMpn, va);
return PAGE_LOOKUP_INVALID_ADDR;
}
}
/*
* Verify that this MPN was locked with
* HostIF_LockPage(allowMultipleMPNsPerVA = TRUE).
* That means that this MPN should not be in the MemTracker.
*/
e = MemTrack_LookupMPN(vm->memtracker, mpn);
if (e) {
Warning("%s(): mpn=%#"FMT64"x va=%p was permanently locked with "
"vpn=0x%"FMT64"x\n", __func__, mpn, va, e->vpn);
return PAGE_UNLOCK_MISMATCHED_TYPE;
}
}
#endif
HOST_UNLOCK_PFN_BYMPN(vm, mpn);
return PAGE_UNLOCK_SUCCESS;
}
static void
UnlockEntry(void *clientData, // IN:
MemTrackEntry *entryPtr) // IN:
{
VMDriver *vm = (VMDriver *)clientData;
if (entryPtr->mpn) {
HOST_UNLOCK_PFN(vm,entryPtr->mpn);
entryPtr->mpn = 0;
}
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_FreeAllResources --
*
* Free all host-specific VM resources.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FreeAllResources(VMDriver *vm) // IN
{
unsigned int cnt;
HostIFHostMemCleanup(vm);
if (vm->memtracker) {
MemTrack_Cleanup(vm->memtracker, UnlockEntry, vm);
vm->memtracker = NULL;
}
if (vm->vmhost) {
for (cnt = vm->vmhost->crosspagePagesCount; cnt > 0; ) {
struct page* p = vm->vmhost->crosspagePages[--cnt];
UnmapCrossPage(p, vm->crosspage[cnt]);
}
vm->vmhost->crosspagePagesCount = 0;
if (vm->vmhost->hostAPICIsMapped) {
ASSERT(vm->hostAPIC.base != NULL);
iounmap((void*)vm->hostAPIC.base);
vm->hostAPIC.base = NULL;
vm->vmhost->hostAPICIsMapped = FALSE;
}
HostIF_FreeKernelMem(vm->vmhost);
vm->vmhost = NULL;
}
}
/*
*----------------------------------------------------------------------
*
* HostIF_AllocKernelMem
*
* Allocate some kernel memory for the driver.
*
* Results:
* The address allocated or NULL on error.
*
*
* Side effects:
* memory is malloced
*----------------------------------------------------------------------
*/
void *
HostIF_AllocKernelMem(size_t size, // IN:
int wired) // IN:
{
void * ptr = kmalloc(size, GFP_KERNEL);
if (ptr == NULL) {
Warning("%s failed (size=%p)\n", __func__, (void*)size);
}
return ptr;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_AllocPage --
*
* Allocate a page (whose content is undetermined)
*
* Results:
* The kernel virtual address of the page
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_AllocPage(void)
{
VA kvAddr;
kvAddr = __get_free_page(GFP_KERNEL);
if (kvAddr == 0) {
Warning("%s: __get_free_page() failed\n", __func__);
}
return (void *)kvAddr;
}
/*
*----------------------------------------------------------------------
*
* HostIF_FreeKernelMem
*
* Free kernel memory allocated for the driver.
*
* Results:
* None.
*
* Side effects:
* memory is freed.
*----------------------------------------------------------------------
*/
void
HostIF_FreeKernelMem(void *ptr) // IN:
{
kfree(ptr);
}
void
HostIF_FreePage(void *ptr) // IN:
{
VA vAddr = (VA)ptr;
if (vAddr & (PAGE_SIZE-1)) {
Warning("%s %p misaligned\n", __func__, (void*)vAddr);
} else {
free_page(vAddr);
}
}
/*
*----------------------------------------------------------------------
*
* HostIF_EstimateLockedPageLimit --
*
* Estimates how many memory pages can be locked or allocated
* from the kernel without causing the host to die or to be really upset.
*
* Results:
* The maximum number of pages that can be locked.
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
unsigned int
HostIF_EstimateLockedPageLimit(const VMDriver* vm, // IN
unsigned int currentlyLockedPages) // IN
{
/*
* This variable is available and exported to modules
* since at least 2.6.0.
*/
extern unsigned long totalram_pages;
unsigned int totalPhysicalPages = totalram_pages;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
return MemDefaults_CalcMaxLockedPages(totalPhysicalPages);
#else
/*
* Use the memory information linux exports as of late for a more
* precise estimate of locked memory. All kernel page-related structures
* (slab, pagetable) are as good as locked. Unevictable includes things
* that are explicitly marked as such (like mlock()). Huge pages are
* also as good as locked, since we don't use them. Lastly, without
* available swap, anonymous pages become locked in memory as well.
*/
unsigned int forHost;
unsigned int reservedPages = MEMDEFAULTS_MIN_HOST_PAGES;
unsigned int hugePages = (vm == NULL) ? 0 :
BYTES_2_PAGES(vm->memInfo.hugePageBytes);
unsigned int lockedPages = global_zone_page_state(NR_PAGETABLE) +
get_nr_slab_unreclaimable() +
get_nr_unevictable() +
hugePages + reservedPages;
unsigned int anonPages = get_nr_anon_mapped();
unsigned int swapPages = BYTES_2_PAGES(linuxState.swapSize);
if (anonPages > swapPages) {
lockedPages += anonPages - swapPages;
}
forHost = lockedPages + LOCKED_PAGE_SLACK;
if (forHost > totalPhysicalPages) {
forHost = totalPhysicalPages;
}
return totalPhysicalPages - forHost;
#endif
}
/*
*----------------------------------------------------------------------
*
* HostIF_Wait --
*
* Waits for specified number of milliseconds.
*
*----------------------------------------------------------------------
*/
void
HostIF_Wait(unsigned int timeoutMs)
{
msleep_interruptible(timeoutMs);
}
/*
*----------------------------------------------------------------------
*
* HostIF_WaitForFreePages --
*
* Waits for pages to be available for allocation or locking.
*
* Results:
* New pages are likely to be available for allocation or locking.
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
void
HostIF_WaitForFreePages(unsigned int timeoutMs) // IN:
{
static unsigned count;
msleep_interruptible(timeoutMs);
count++;
}
/*
*----------------------------------------------------------------------
*
* HostIFReadUptimeWork --
*
* Reads the current uptime. The uptime is based on gettimeofday,
* which provides the needed high resolution. However, we don't
* want uptime to be warped by e.g. calls to settimeofday. So, we
* use a jiffies based monotonic clock to sanity check the uptime.
* If the uptime is more than one second from the monotonic time,
* we assume that the time of day has been set, and recalculate the
* uptime base to get uptime back on track with monotonic time. On
* the other hand, we do expect jiffies based monotonic time and
* timeofday to have small drift (due to NTP rate correction, etc).
* We handle this by rebasing the jiffies based monotonic clock
* every second (see HostIFUptimeResyncMono).
*
* Results:
* The uptime, in units of UPTIME_FREQ. Also returns the jiffies
* value that was used in the monotonic time calculation.
*
* Side effects:
* May reset the uptime base in the case gettimeofday warp was
* detected.
*
*----------------------------------------------------------------------
*/
static uint64
HostIFReadUptimeWork(unsigned long *j) // OUT: current jiffies
{
struct timeval tv;
uint64 monotime, uptime, upBase, monoBase;
int64 diff;
uint32 version;
unsigned long jifs, jifBase;
unsigned int attempts = 0;
/* Assert that HostIF_InitUptime has been called. */
ASSERT(uptimeState.timer.function);
retry:
do {
version = VersionedAtomic_BeginTryRead(&uptimeState.version);
jifs = jiffies;
jifBase = uptimeState.jiffiesBase;
monoBase = uptimeState.monotimeBase;
} while (!VersionedAtomic_EndTryRead(&uptimeState.version, version));
do_gettimeofday(&tv);
upBase = Atomic_Read64(&uptimeState.uptimeBase);
monotime = (uint64)(jifs - jifBase) * (UPTIME_FREQ / HZ);
monotime += monoBase;
uptime = tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ;
uptime += upBase;
/*
* Use the jiffies based monotonic time to sanity check gettimeofday.
* If they differ by more than one second, assume the time of day has
* been warped, and use the jiffies time to undo (most of) the warp.
*/
diff = uptime - monotime;
if (UNLIKELY(diff < -UPTIME_FREQ || diff > UPTIME_FREQ)) {
/* Compute a new uptimeBase to get uptime back on track. */
uint64 newUpBase = monotime - (uptime - upBase);
attempts++;
if (!Atomic_CMPXCHG64(&uptimeState.uptimeBase, &upBase, &newUpBase) &&
attempts < 5) {
/* Another thread updated uptimeBase. Recalculate uptime. */
goto retry;
}
uptime = monotime;
Log("%s: detected settimeofday: fixed uptimeBase old %"FMT64"u "
"new %"FMT64"u attempts %u\n", __func__,
upBase, newUpBase, attempts);
}
*j = jifs;
return uptime;
}
/*
*----------------------------------------------------------------------
*
* HostIFUptimeResyncMono --
*
* Timer that fires every second to resynchronize the jiffies based
* monotonic time with the uptime.
*
* Results:
* None
*
* Side effects:
* Resets the monotonic time bases so that jiffies based monotonic
* time does not drift from gettimeofday over the long term.
*
*----------------------------------------------------------------------
*/
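/*
* Note: compat_timer_arg_t comes from the compat_timer.h shim added earlier
* in this post; it is "unsigned long" before 4.15 and "struct timer_list *"
* from 4.15 on, so this callback compiles against both timer APIs.
*/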
static void
HostIFUptimeResyncMono(compat_timer_arg_t unused) // IN: ignored
{
unsigned long jifs;
uintptr_t flags;
/*
* Read the uptime and the corresponding jiffies value. This will
* also correct the uptime (which is based on time of day) if needed
* before we rebase monotonic time (which is based on jiffies).
*/
uint64 uptime = HostIFReadUptimeWork(&jifs);
/*
* Every second, recalculate monoBase and jiffiesBase to squash small
* drift between gettimeofday and jiffies. Also, this prevents
* (jiffies - jiffiesBase) wrap on 32-bits.
*/
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
VersionedAtomic_BeginWrite(&uptimeState.version);
uptimeState.monotimeBase = uptime;
uptimeState.jiffiesBase = jifs;
VersionedAtomic_EndWrite(&uptimeState.version);
RESTORE_FLAGS(flags);
/* Reschedule this timer to expire in one second. */
mod_timer(&uptimeState.timer, jifs + HZ);
}
/*
*----------------------------------------------------------------------
*
* HostIF_InitUptime --
*
* Initialize the uptime clock's state.
*
* Results:
* None
*
* Side effects:
* Sets the initial values for the uptime state, and schedules
* the uptime timer.
*
*----------------------------------------------------------------------
*/
void
HostIF_InitUptime(void)
{
struct timeval tv;
uptimeState.jiffiesBase = jiffies;
do_gettimeofday(&tv);
Atomic_Write64(&uptimeState.uptimeBase,
-(tv.tv_usec * (UPTIME_FREQ / 1000000) +
tv.tv_sec * UPTIME_FREQ));
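/*
* timer_setup() is native on recent kernels (it appeared around 4.14/4.15);
* on older ones it is supplied by the compat_timer.h shim, which maps it
* onto init_timer().
*/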
timer_setup(&uptimeState.timer, HostIFUptimeResyncMono, 0);
mod_timer(&uptimeState.timer, jiffies + HZ);
}
/*
*----------------------------------------------------------------------
*
* HostIF_CleanupUptime --
*
* Cleanup uptime state, called at module unloading time.
*
* Results:
* None
*
* Side effects:
* Deschedule the uptime timer.
*
*----------------------------------------------------------------------
*/
void
HostIF_CleanupUptime(void)
{
del_timer_sync(&uptimeState.timer);
}
/*
*----------------------------------------------------------------------
*
* HostIF_ReadUptime --
*
* Read the system time. The returned value has no meaningful absolute
* value; only the difference between two calls should be used.
*
* Results:
* Units are given by HostIF_UptimeFrequency.
*
* Side effects:
* See HostIFReadUptimeWork
*
*----------------------------------------------------------------------
*/
uint64
HostIF_ReadUptime(void)
{
unsigned long jifs;
return HostIFReadUptimeWork(&jifs);
}
/*
*----------------------------------------------------------------------
*
* HostIF_UptimeFrequency
*
* Return the frequency of the counter that HostIF_ReadUptime reads.
*
* Results:
* Frequency in Hz.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
uint64
HostIF_UptimeFrequency(void)
{
return UPTIME_FREQ;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_CopyFromUser --
*
* Copy memory from the user application into a kernel buffer. This
* function may block, so don't call it while holding any kind of
* lock. --hpreg
*
* Results:
* 0 on success
* -EFAULT on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_CopyFromUser(void *dst, // OUT
const void *src, // IN
unsigned int len) // IN
{
return copy_from_user(dst, src, len) ? -EFAULT : 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_CopyToUser --
*
* Copy memory to the user application from a kernel buffer. This
* function may block, so don't call it while holding any kind of
* lock. --hpreg
*
* Results:
* 0 on success
* -EFAULT on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_CopyToUser(void *dst, // OUT
const void *src, // IN
unsigned int len) // IN
{
return copy_to_user(dst, src, len) ? -EFAULT : 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_MapCrossPage --
*
* Obtain kernel pointer to crosspage.
*
* We must return a VA that is obtained through a kernel mapping, so that
* the mapping never goes away (see bug 29753).
*
* However, the LA corresponding to that VA must not overlap with the
* monitor (see bug 32922). The userland code ensures that by only
* allocating cross pages from low memory. For those pages, the kernel
* uses a permanent mapping, instead of a temporary one with a high LA.
*
* Results:
* The kernel virtual address on success
* NULL on failure
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_MapCrossPage(VMDriver *vm, // IN
VA64 uAddr) // IN
{
void *p = VA64ToPtr(uAddr);
struct page *page;
VA vPgAddr;
VA ret;
if (HostIFGetUserPages(p, &page, 1)) {
return NULL;
}
vPgAddr = (VA) MapCrossPage(page);
HostIF_VMLock(vm, 27);
if (vm->vmhost->crosspagePagesCount >= MAX_INITBLOCK_CPUS) {
HostIF_VMUnlock(vm, 27);
UnmapCrossPage(page, (void*)vPgAddr);
return NULL;
}
vm->vmhost->crosspagePages[vm->vmhost->crosspagePagesCount++] = page;
HostIF_VMUnlock(vm, 27);
ret = vPgAddr | (((VA)p) & (PAGE_SIZE - 1));
return (void*)ret;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_AllocCrossGDT --
*
* Allocate the per-vmmon cross GDT page set.
*
* See bora/doc/worldswitch-pages.txt for the requirements on the cross
* GDT page set addresses.
*
* Results:
* On success: Host kernel virtual address of the first cross GDT page.
* Use HostIF_FreeCrossGDT() with the same value to free.
* The 'crossGDTMPNs' array is filled with the MPNs of all the
* cross GDT pages.
* On failure: NULL.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_AllocCrossGDT(uint32 numPages, // IN: Number of pages
MPN maxValidFirst, // IN: Highest valid MPN of first page
MPN *crossGDTMPNs) // OUT: Array of MPNs
{
MPN startMPN;
struct page *pages;
uint32 i;
void *crossGDT;
/*
* In practice, allocating a low page (MPN <= 0x100000 - 1) is equivalent to
* allocating a page with MPN <= 0xFEC00 - 1:
*
* o PC architecture guarantees that there is no RAM in top 16MB of 4GB
* range.
*
* o 0xFEC00000 is IOAPIC base. There could be RAM immediately below,
* but not above.
*
* How do we allocate a low page? We can safely use GFP_DMA32 when
* available. On 64bit kernels before GFP_DMA32 was introduced we
* fall back to DMA zone (which is not quite necessary for boxes
* with less than ~3GB of memory). On 32bit kernels we are using
* normal zone - which is usually 1GB, and at most 4GB (for 4GB/4GB
* kernels). And for 4GB/4GB kernels same restriction as for 64bit
* kernels applies - there is no RAM in top 16MB immediately below
* 4GB so alloc_pages() cannot return such page.
*/
ASSERT(0xFEC00 - 1 <= maxValidFirst);
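/*
* The empty loop below computes the smallest order with
* (1 << order) >= numPages, which is what alloc_pages() expects;
* HostIF_FreeCrossGDT() repeats the same computation so that free_pages()
* gets a matching order.
*/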
for (i = 0; (1 << i) < numPages; i++) { }
#ifdef GFP_DMA32
pages = alloc_pages(GFP_KERNEL | GFP_DMA32, i);
#else
pages = alloc_pages(GFP_KERNEL | GFP_DMA, i);
#endif
crossGDT = NULL;
if (pages == NULL) {
Warning("%s: unable to alloc crossGDT (%u)\n", __func__, i);
} else {
startMPN = page_to_pfn(pages);
for (i = 0; i < numPages; i++) {
crossGDTMPNs[i] = startMPN + i;
}
crossGDT = (void *)page_address(pages);
}
return crossGDT;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_FreeCrossGDT --
*
* Free the per-vmmon cross GDT page set allocated with
* HostIF_AllocCrossGDT().
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_FreeCrossGDT(uint32 numPages, // IN: Number of pages
void *crossGDT) // IN: Kernel VA of first cross GDT page
{
uint32 i;
for (i = 0; (1 << i) < numPages; i++) { }
free_pages((VA)crossGDT, i);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_VMLock --
*
* Grabs per-VM data structure lock. The lock is not recursive.
* The global lock has lower rank so the global lock should be grabbed
* first if both locks are acquired.
*
* It should be a medium contention lock. Also it should be fast:
* it is used for protecting of frequent page allocation and locking.
*
* Results:
* None
*
* Side effects:
* The current thread is rescheduled if the lock is busy.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_VMLock(VMDriver *vm, // IN
int callerID) // IN
{
ASSERT(vm);
ASSERT(vm->vmhost);
MutexLock(&vm->vmhost->vmMutex, callerID);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_VMUnlock --
*
* Releases per-VM data structure lock.
*
* Results:
* None
*
* Side effects:
* Can wake up the thread blocked on this lock.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_VMUnlock(VMDriver *vm, // IN
int callerID) // IN
{
ASSERT(vm);
ASSERT(vm->vmhost);
MutexUnlock(&vm->vmhost->vmMutex, callerID);
}
#ifdef VMX86_DEBUG
/*
*-----------------------------------------------------------------------------
*
* HostIF_VMLockIsHeld --
*
* Determine if the per-VM lock is held by the current thread.
*
* Results:
* TRUE if yes
* FALSE if no
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIF_VMLockIsHeld(VMDriver *vm) // IN
{
ASSERT(vm);
ASSERT(vm->vmhost);
return MutexIsLocked(&vm->vmhost->vmMutex);
}
#endif
/*
* Utility routines for accessing and enabling the APIC
*/
/*
* Defines for accessing the APIC. We use readl/writel to access the APIC
* which is how Linux wants you to access I/O memory (though on the x86
* just dereferencing a pointer works just fine).
*/
#define APICR_TO_ADDR(apic, reg) (apic + (reg << 4))
#define GET_APIC_REG(apic, reg) (readl(APICR_TO_ADDR(apic, reg)))
#define SET_APIC_REG(apic, reg, val) (writel(val, APICR_TO_ADDR(apic, reg)))
#define APIC_MAXLVT(apic) ((GET_APIC_REG(apic, APICR_VERSION) >> 16) & 0xff)
#define APIC_VERSIONREG(apic) (GET_APIC_REG(apic, APICR_VERSION) & 0xff)
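/*
* Note: APIC registers sit on 16-byte boundaries, hence the (reg << 4)
* byte-offset computation in APICR_TO_ADDR(). For example,
* GET_APIC_REG(apic, APICR_VERSION) expands to
* readl(apic + (APICR_VERSION << 4)).
*/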
#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \
defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC)
/*
*----------------------------------------------------------------------
*
* isVAReadable --
*
* Verify that the passed VA is accessible without crashing...
*
* Results:
* TRUE if address is readable, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
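/*
* Note: the set_fs(get_ds()) dance below temporarily lifts the user/kernel
* address-limit check so that HostIF_CopyFromUser() can probe a kernel VA.
* get_ds()/set_fs() still exist on FC26-era kernels; much later kernels
* removed them, and this probe would need a different mechanism there.
*/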
static Bool
isVAReadable(VA r) // IN:
{
mm_segment_t old_fs;
uint32 dummy;
int ret;
old_fs = get_fs();
set_fs(get_ds());
r = APICR_TO_ADDR(r, APICR_VERSION);
ret = HostIF_CopyFromUser(&dummy, (void*)r, sizeof(dummy));
set_fs(old_fs);
return ret == 0;
}
/*
*----------------------------------------------------------------------
*
* SetVMAPICAddr --
*
* Maps the host cpu's APIC. The virtual address is stashed in
* the VMDriver structure.
*
* Results:
* None.
*
* Side effects:
* The VMDriver structure is updated.
*
*----------------------------------------------------------------------
*/
static void
SetVMAPICAddr(VMDriver *vm, // IN/OUT: driver state
MA ma) // IN: host APIC's ma
{
volatile void *hostapic;
ASSERT_ON_COMPILE(APICR_SIZE <= PAGE_SIZE);
hostapic = (volatile void *) ioremap_nocache(ma, PAGE_SIZE);
if (hostapic) {
if ((APIC_VERSIONREG(hostapic) & 0xF0) == 0x10) {
vm->hostAPIC.base = (volatile uint32 (*)[4]) hostapic;
ASSERT(vm->vmhost != NULL);
vm->vmhost->hostAPICIsMapped = TRUE;
} else {
iounmap((void*)hostapic);
}
}
}
/*
*----------------------------------------------------------------------
*
* ProbeAPIC --
*
* Attempts to map the host APIC.
*
* Most versions of Linux already provide access to a mapped
* APIC. This function is just a backup.
*
* Caveat: We assume that the APIC physical address is the same
* on all host cpus.
*
* Results:
* TRUE if APIC was found, FALSE if not.
*
* Side effects:
* May map the APIC.
*
*----------------------------------------------------------------------
*/
static Bool
ProbeAPIC(VMDriver *vm, // IN/OUT: driver state
Bool setVMPtr) // IN: set a pointer to the APIC's virtual address
{
MA ma = APIC_GetMA();
if (ma == (MA)-1) {
return FALSE;
}
if (setVMPtr) {
SetVMAPICAddr(vm, ma);
} else {
vm->hostAPIC.base = NULL;
}
return TRUE;
}
#endif
/*
*----------------------------------------------------------------------
*
* HostIF_APICInit --
*
* Initialize APIC behavior.
* Attempts to map the host APIC into vm->hostAPIC.
*
* We don't attempt to refresh the mapping after a host cpu
* migration. Fortunately, hosts tend to use the same address
* for all APICs.
*
* Most versions of Linux already provide a mapped APIC. We
* have backup code to read APIC_BASE and map it, if needed.
*
* Results:
* TRUE
*
* Side effects:
* May map the host APIC.
*
*----------------------------------------------------------------------
*/
Bool
HostIF_APICInit(VMDriver *vm, // IN:
Bool setVMPtr, // IN:
Bool probe) // IN: force probing
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \
defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC)
static Bool apicIPILogged = FALSE;
VA kAddr;
monitorIPIVector = SPURIOUS_APIC_VECTOR;
#if defined(POSTED_INTR_VECTOR)
hvIPIVector = POSTED_INTR_VECTOR;
#else
hvIPIVector = 0;
#endif
if (!apicIPILogged) {
Log("Monitor IPI vector: %x\n", monitorIPIVector);
Log("HV IPI vector: %x\n", hvIPIVector);
apicIPILogged = TRUE;
}
if ((__GET_MSR(MSR_APIC_BASE) & APIC_MSR_X2APIC_ENABLED) != 0) {
if (setVMPtr) {
vm->hostAPIC.base = NULL;
vm->vmhost->hostAPICIsMapped = FALSE;
vm->hostAPIC.isX2 = TRUE;
}
return TRUE;
}
if (probe && ProbeAPIC(vm, setVMPtr)) {
return TRUE;
}
/*
* Normal case: use Linux's pre-mapped APIC.
*/
kAddr = __fix_to_virt(FIX_APIC_BASE);
if (!isVAReadable(kAddr)) {
return TRUE;
}
if (setVMPtr) {
vm->hostAPIC.base = (void *)kAddr;
} else {
vm->hostAPIC.base = NULL;
}
#endif
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SemaphoreWait --
*
* Perform the semaphore wait (P) operation, possibly blocking.
*
* Result:
* 1 (which equals MX_WAITNORMAL) if success,
* negated error code otherwise.
*
* Side-effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SemaphoreWait(VMDriver *vm, // IN:
Vcpuid vcpuid, // IN:
uint64 *args) // IN:
{
struct file *file;
mm_segment_t old_fs;
int res;
int waitFD = args[0];
int timeoutms = args[2];
uint64 value;
file = vmware_fget(waitFD);
if (file == NULL) {
return MX_WAITERROR;
}
old_fs = get_fs();
set_fs(get_ds());
{
struct poll_wqueues table;
unsigned int mask;
poll_initwait(&table);
current->state = TASK_INTERRUPTIBLE;
mask = file->f_op->poll(file, &table.pt);
if (!(mask & (POLLIN | POLLERR | POLLHUP))) {
vm->vmhost->vcpuSemaTask[vcpuid] = current;
schedule_timeout(timeoutms * HZ / 1000); // convert ms to jiffies
vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
}
current->state = TASK_RUNNING;
poll_freewait(&table);
}
/*
* Userland only writes in multiples of sizeof(uint64). This will allow
* the code to happily deal with a pipe or an eventfd. We only care about
* reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64).
*/
res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos);
if (res == sizeof value) {
res = MX_WAITNORMAL;
} else {
if (res == 0) {
res = -EBADF;
}
}
set_fs(old_fs);
fput(file);
/*
* Handle benign errors:
* EAGAIN is MX_WAITTIMEDOUT.
* The signal-related errors are all mapped into MX_WAITINTERRUPTED.
*/
switch (res) {
case -EAGAIN:
res = MX_WAITTIMEDOUT;
break;
case -EINTR:
case -ERESTART:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
res = MX_WAITINTERRUPTED;
break;
case -EBADF:
res = MX_WAITERROR;
break;
}
return res;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SemaphoreForceWakeup --
*
* For each VCPU in the set whose target process is lightly sleeping (i.e.
* TASK_INTERRUPTIBLE), wake it up. The target process can be waiting on a
* semaphore or due to a call to Vmx86_YieldToSet.
*
* Result:
* None.
*
* Side-effects:
* None
*
*-----------------------------------------------------------------------------
*/
void
HostIF_SemaphoreForceWakeup(VMDriver *vm, // IN:
const VCPUSet *vcs) // IN:
{
FOR_EACH_VCPU_IN_SET(vcs, vcpuid) {
struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
if (t && (t->state & TASK_INTERRUPTIBLE)) {
wake_up_process(t);
}
} ROF_EACH_VCPU_IN_SET();
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SemaphoreSignal --
*
* Perform the semaphore signal (V) operation.
*
* Result:
* On success: MX_WAITNORMAL (1).
* On error: MX_WAITINTERRUPTED (3) if interrupted by a Unix signal (we
* can block on a preemptive kernel).
* MX_WAITERROR (0) on generic error.
* Negated system error (< 0).
*
* Side-effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SemaphoreSignal(uint64 *args) // IN:
{
struct file *file;
mm_segment_t old_fs;
int res;
int signalFD = args[1];
uint64 value = 1; // make an eventfd happy should it be there
file = vmware_fget(signalFD);
if (!file) {
return MX_WAITERROR;
}
old_fs = get_fs();
set_fs(get_ds());
/*
* Always write sizeof(uint64) bytes. This works fine for eventfd and
* pipes. The data written is formatted to make an eventfd happy should
* it be present.
*/
res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos);
if (res == sizeof value) {
res = MX_WAITNORMAL;
}
set_fs(old_fs);
fput(file);
/*
* Handle benign errors:
* EAGAIN is MX_WAITTIMEDOUT.
* The signal-related errors are all mapped into MX_WAITINTERRUPTED.
*/
switch (res) {
case -EAGAIN:
// The pipe is full, so it is already signalled. Success.
res = MX_WAITNORMAL;
break;
case -EINTR:
case -ERESTART:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
res = MX_WAITINTERRUPTED;
break;
}
return res;
}
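/*
* Illustrative sketch (not part of the driver): userland is expected to
* hand vmmon an eventfd or pipe fd and to transfer exactly sizeof(uint64)
* bytes, matching the read/write checks above. Roughly:
*
*   int fd = eventfd(0, EFD_NONBLOCK);  // hypothetical userland setup
*   uint64_t v = 1;
*   write(fd, &v, sizeof v);            // signal (V), as done here
*   read(fd, &v, sizeof v);             // wait (P), as in HostIF_SemaphoreWait
*/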
#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)) || !defined(CONFIG_SMP))
# define VMMON_USE_CALL_FUNC
#endif
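/*
* Rationale, as far as I can tell: per-CPU IPI targeting via
* arch_send_call_function_single_ipi() only exists with the generic SMP
* helpers introduced around 2.6.27, so older or non-SMP kernels fall back
* to a broadcast through smp_call_function().
*/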
#if defined(VMMON_USE_CALL_FUNC)
/*
*----------------------------------------------------------------------
*
* LinuxDriverIPIHandler --
*
* Null IPI handler - for monitor to notice AIO completion
*
*----------------------------------------------------------------------
*/
void
LinuxDriverIPIHandler(void *info)
{
return;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 17)
#define VMMON_CALL_FUNC_SYNC 0 // async; we've not seen any problems
#else
#define VMMON_CALL_FUNC_SYNC 1 // sync; ensure no problems from old releases
#endif
#endif
/*
*----------------------------------------------------------------------
*
* HostIF_IPI --
*
* If the passed VCPU threads are on some CPUs in the system,
* attempt to hit them with an IPI.
*
* On older Linux systems we do a broadcast.
*
* Result:
* The mode used to send IPIs.
*
*----------------------------------------------------------------------
*/
HostIFIPIMode
HostIF_IPI(VMDriver *vm, // IN:
const VCPUSet *ipiTargets) // IN:
{
HostIFIPIMode mode = IPI_NONE;
ASSERT(vm);
FOR_EACH_VCPU_IN_SET(ipiTargets, v) {
uint32 targetHostCpu = vm->currentHostCpu[v];
if (targetHostCpu != INVALID_PCPU) {
ASSERT(targetHostCpu < MAX_PCPUS);
#if defined(VMMON_USE_CALL_FUNC)
/* older kernels IPI broadcast; use async when possible */
(void) compat_smp_call_function(LinuxDriverIPIHandler,
NULL, VMMON_CALL_FUNC_SYNC);
mode = IPI_BROADCAST;
break;
#else
/* Newer kernels have (async) IPI targeting */
arch_send_call_function_single_ipi(targetHostCpu);
mode = IPI_UNICAST;
#endif
}
} ROF_EACH_VCPU_IN_SET();
return mode;
}
typedef struct {
Atomic_uint32 index;
CPUIDQuery *query;
} HostIFGetCpuInfoData;
/*
*-----------------------------------------------------------------------------
*
* HostIFGetCpuInfo --
*
* Collect CPUID information on the current logical CPU.
*
* Results:
* None.
*
* Side effects:
* 'data->index' is atomically incremented by one.
*
*-----------------------------------------------------------------------------
*/
static void
HostIFGetCpuInfo(void *clientData) // IN/OUT: A HostIFGetCpuInfoData *
{
HostIFGetCpuInfoData *data = (HostIFGetCpuInfoData *)clientData;
CPUIDQuery *query;
uint32 index;
ASSERT(data);
query = data->query;
ASSERT(query);
index = Atomic_ReadInc32(&data->index);
if (index >= query->numLogicalCPUs) {
return;
}
query->logicalCPUs[index].tag = HostIF_GetCurrentPCPU();
__GET_CPUID2(query->eax, query->ecx, &query->logicalCPUs[index].regs);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GetAllCpuInfo --
*
* Collect CPUID information on all logical CPUs.
*
* 'query->numLogicalCPUs' is the size of the 'query->logicalCPUs' output
* array.
*
* Results:
* On success: TRUE. 'query->logicalCPUs' is filled and
* 'query->numLogicalCPUs' is adjusted accordingly.
* On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIF_GetAllCpuInfo(CPUIDQuery *query) // IN/OUT
{
HostIFGetCpuInfoData data;
Atomic_Write32(&data.index, 0);
data.query = query;
/*
* XXX Linux has userland APIs to bind a thread to a processor, so we could
* probably implement this in userland like we do on Win32.
*/
HostIF_CallOnEachCPU(HostIFGetCpuInfo, &data);
/*
* At this point, Atomic_Read32(&data.index) is the number of logical CPUs
* who replied.
*/
if (Atomic_Read32(&data.index) > query->numLogicalCPUs) {
return FALSE;
}
ASSERT(Atomic_Read32(&data.index) <= query->numLogicalCPUs);
query->numLogicalCPUs = Atomic_Read32(&data.index);
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* HostIF_CallOnEachCPU --
*
* Call specified function once on each CPU. No ordering guarantees.
*
* Results:
* None.
*
* Side effects:
* None. May be slow.
*
*----------------------------------------------------------------------
*/
void
HostIF_CallOnEachCPU(void (*func)(void*), // IN: function to call
void *data) // IN/OUT: argument to function
{
preempt_disable();
(*func)(data);
(void)compat_smp_call_function(*func, data, 1);
preempt_enable();
}
/*
*-----------------------------------------------------------------------------
*
* HostIFCheckTrackedMPN --
*
* Check if a given MPN is tracked for the specified VM.
*
* Result:
* TRUE if the MPN is tracked in one of the trackers for the specified VM,
* FALSE otherwise.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
Bool
HostIFCheckTrackedMPN(VMDriver *vm, // IN: The VM instance
MPN mpn) // IN: The MPN
{
VMHost * const vmh = vm->vmhost;
if (vmh == NULL) {
return FALSE;
}
HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock.
if (vmh->lockedPages) {
if (PhysTrack_Test(vmh->lockedPages, mpn)) {
HostIF_VMUnlock(vm, 32);
return TRUE;
}
}
if (vmh->AWEPages) {
if (PhysTrack_Test(vmh->AWEPages, mpn)) {
HostIF_VMUnlock(vm, 32);
return TRUE;
}
}
if (vm->memtracker) {
if (MemTrack_LookupMPN(vm->memtracker, mpn) != NULL) {
HostIF_VMUnlock(vm, 32);
return TRUE;
}
}
HostIF_VMUnlock(vm, 32);
if (vmx86_debug) {
/*
* The monitor may have old KSeg mappings to pages which it no longer
* owns. Minimize customer noise by only logging this for developers.
*/
Log("%s: MPN %" FMT64 "x not owned by this VM\n", __FUNCTION__, mpn);
}
return FALSE;
}
/*
*----------------------------------------------------------------------
*
* HostIF_ReadPage --
*
* Reads one page of data from a machine page and returns it in the
* specified kernel or user buffer. The machine page must be owned by
* the specified VM.
*
* Results:
* 0 on success
* negative error code on error
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
int
HostIF_ReadPage(VMDriver *vm, // IN: The VM instance
MPN mpn, // MPN of the page
VA64 addr, // buffer for data
Bool kernelBuffer) // is the buffer in kernel space?
{
void *buf = VA64ToPtr(addr);
int ret = 0;
const void* ptr;
struct page* page;
if (mpn == INVALID_MPN) {
return -EFAULT;
}
if (HostIFCheckTrackedMPN(vm, mpn) == FALSE) {
return -EFAULT;
}
page = pfn_to_page(mpn);
ptr = kmap(page);
if (ptr == NULL) {
return -ENOMEM;
}
if (kernelBuffer) {
memcpy(buf, ptr, PAGE_SIZE);
} else {
ret = HostIF_CopyToUser(buf, ptr, PAGE_SIZE);
}
kunmap(page);
return ret;
}
/*
*----------------------------------------------------------------------
*
* HostIF_WritePage --
*
* Writes one page of data from a kernel or user buffer onto the specified
* machine page. The machine page must be owned by the specified VM.
*
* Results:
* 0 on success
* negative error code on error
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
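/*
* HostIFWritePageWork is the shared helper: it performs the copy without
* the MPN-ownership check, which HostIF_WritePage adds and
* HostIF_WriteMachinePage deliberately skips (host-global pages are not
* tracked by any VM).
*/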
int
HostIFWritePageWork(MPN mpn, // MPN of the page
VA64 addr, // data to write to the page
Bool kernelBuffer) // is the buffer in kernel space?
{
void const *buf = VA64ToPtr(addr);
int ret = 0;
void* ptr;
struct page* page;
if (mpn == INVALID_MPN) {
return -EFAULT;
}
page = pfn_to_page(mpn);
ptr = kmap(page);
if (ptr == NULL) {
return -ENOMEM;
}
if (kernelBuffer) {
memcpy(ptr, buf, PAGE_SIZE);
} else {
ret = HostIF_CopyFromUser(ptr, buf, PAGE_SIZE);
}
kunmap(page);
return ret;
}
int
HostIF_WritePage(VMDriver *vm, // IN: The VM instance
MPN mpn, // MPN of the page
VA64 addr, // data to write to the page
Bool kernelBuffer) // is the buffer in kernel space?
{
if (HostIFCheckTrackedMPN(vm, mpn) == FALSE) {
return -EFAULT;
}
return HostIFWritePageWork(mpn, addr, kernelBuffer);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_WriteMachinePage --
*
* Puts the content of a machine page into a kernel or user mode
* buffer. This should only be used for host-global pages, not any
* VM-owned pages.
*
* Results:
* On success: 0
* On failure: a negative error code
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_WriteMachinePage(MPN mpn, // IN: MPN of the page
VA64 addr) // IN: data to write to the page
{
return HostIFWritePageWork(mpn, addr, TRUE);
}
/*
*----------------------------------------------------------------------
*
* HostIF_GetLockedPageList --
*
* Puts the MPNs of pages that were allocated by HostIF_AllocLockedPages()
* into a user-mode buffer.
*
* Results:
* non-negative number of the MPNs in the buffer on success.
* negative error code on error (-EFAULT)
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
int
HostIF_GetLockedPageList(VMDriver* vm, // IN: VM instance pointer
VA64 uAddr, // OUT: user mode buffer for MPNs
unsigned int numPages) // IN: size of the buffer in MPNs
{
MPN *mpns = VA64ToPtr(uAddr);
MPN mpn;
unsigned count;
struct PhysTracker* AWEPages;
if (!vm->vmhost || !vm->vmhost->AWEPages) {
return 0;
}
AWEPages = vm->vmhost->AWEPages;
for (mpn = 0, count = 0;
(count < numPages) &&
(INVALID_MPN != (mpn = PhysTrack_GetNext(AWEPages, mpn)));
count++) {
if (HostIF_CopyToUser(&mpns[count], &mpn, sizeof *mpns) != 0) {
return -EFAULT;
}
}
return count;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_GetNextAnonPage --
*
* If "inMPN" is INVALID_MPN, gets the first MPN in the anon MPN list;
* otherwise gets the anon MPN after "inMPN" in that list.
*
* Results:
* Next anon MPN. If the list has been exhausted, returns INVALID_MPN.
*
*-----------------------------------------------------------------------------
*/
MPN
HostIF_GetNextAnonPage(VMDriver *vm, MPN inMPN)
{
if (!vm->vmhost || !vm->vmhost->AWEPages) {
return INVALID_MPN;
}
return PhysTrack_GetNext(vm->vmhost->AWEPages, inMPN);
}
/*
*----------------------------------------------------------------------
*
* HostIF_GetCurrentPCPU --
*
* Get current physical CPU id. Interrupts should be disabled so
* that the thread cannot move to another CPU.
*
* Results:
* Host CPU number.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
uint32
HostIF_GetCurrentPCPU(void)
{
return smp_processor_id();
}
#ifdef VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
/*
*----------------------------------------------------------------------
*
* HostIFWakeupClockThread --
*
* Wake up the fast clock thread. Can't do this from the timer
* callback, because it holds locks that the scheduling code
* might take.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static void
HostIFWakeupClockThread(unsigned long data) //IN:
{
wake_up_process(linuxState.fastClockThread);
}
/*
*----------------------------------------------------------------------
*
* HostIFTimerCallback --
*
* Schedule a tasklet to wake up the fast clock thread.
*
* Results:
* Tell the kernel not to restart the timer.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static enum hrtimer_restart
HostIFTimerCallback(struct hrtimer *timer) //IN:
{
tasklet_schedule(&timerTasklet);
return HRTIMER_NORESTART;
}
/*
*----------------------------------------------------------------------
*
* HostIFScheduleHRTimeout --
*
* Schedule an hrtimer to wake up the fast clock thread.
*
* Results:
* None.
*
* Side effects:
* Sleep.
*
*----------------------------------------------------------------------
*/
static void
HostIFScheduleHRTimeout(ktime_t *expires) //IN:
{
struct hrtimer t;
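   /*
    * Open-coded equivalent of schedule_hrtimeout() for kernels that lack
    * it: arm a temporary hrtimer on the stack, sleep in schedule() while
    * it is pending, and let its callback (via the tasklet above) wake
    * this thread.
    */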
if (expires && !expires->tv64) {
__set_current_state(TASK_RUNNING);
return;
}
hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
t.function = HostIFTimerCallback;
hrtimer_start(&t, *expires, HRTIMER_MODE_REL);
if (hrtimer_active(&t)) {
schedule();
}
hrtimer_cancel(&t);
__set_current_state(TASK_RUNNING);
}
#endif //VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
#ifndef VMMON_USE_HIGH_RES_TIMERS
/*
*----------------------------------------------------------------------
*
* HostIFDoIoctl --
*
 * Issue an ioctl. Assume the kernel is not locked. That is not true
 * today, but it keeps things easier to understand and won't surprise
 * us later when we get rid of the kernel lock in our code.
*
* Results:
* Same as ioctl method.
*
* Side effects:
* none.
*
*----------------------------------------------------------------------
*/
static long
HostIFDoIoctl(struct file *filp,
u_int iocmd,
unsigned long ioarg)
{
if (filp->f_op->unlocked_ioctl) {
return filp->f_op->unlocked_ioctl(filp, iocmd, ioarg);
}
return -ENOIOCTLCMD;
}
#endif //VMMON_USE_HIGH_RES_TIMERS
/*
*----------------------------------------------------------------------
*
* HostIFStartTimer --
*
* Starts the timer using either /dev/rtc or high-resolution timers.
*
* Results:
* Returns 0 on success, -1 on failure.
*
* Side effects:
* Sleep until timer expires.
*
*----------------------------------------------------------------------
*/
int
HostIFStartTimer(Bool rateChanged, //IN: Did rate change?
unsigned int rate, //IN: current clock rate
struct file *filp) //IN: /dev/rtc descriptor
{
#ifdef VMMON_USE_HIGH_RES_TIMERS
static unsigned long slack = 0;
static ktime_t expires;
int timerPeriod;
if (rateChanged) {
timerPeriod = NSEC_PER_SEC / rate;
expires = ktime_set(0, timerPeriod);
/*
* Allow the kernel to expire the timer at its convenience.
* ppoll() uses 0.1% of the timeout value. I think we can
* tolerate 1%.
*/
slack = timerPeriod / 100;
}
set_current_state(TASK_INTERRUPTIBLE);
# ifdef VMMON_USE_SCHEDULE_HRTIMEOUT
schedule_hrtimeout_range(&expires, slack, HRTIMER_MODE_REL);
# else
HostIFScheduleHRTimeout(&expires);
# endif
#else
unsigned p2rate;
int res;
unsigned long buf;
loff_t pos = 0;
if (rateChanged) {
/*
* The host will already have HZ timer interrupts per second. So
* in order to satisfy the requested rate, we need up to (rate -
* HZ) additional interrupts generated by the RTC. That way, if
 * the guest asks for a bit more than 1024 virtual interrupts per
* second (which is a common case for Windows with multimedia
* timers), we'll program the RTC to 1024 rather than 2048, which
* saves a considerable amount of CPU. PR 519228.
*/
if (rate > HZ) {
rate -= HZ;
} else {
rate = 0;
}
/*
* Don't set the RTC rate to 64 Hz or lower: some kernels have a
* bug in the HPET emulation of RTC that will cause the RTC
* frequency to get stuck at 64Hz. See PR 519228 comment #23.
*/
p2rate = 128;
// Hardware rate must be a power of 2
while (p2rate < rate && p2rate < 8192) {
p2rate <<= 1;
}
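      /*
       * Example: a requested rate of 1000 Hz walks 128 -> 256 -> 512 ->
       * 1024, so the RTC is programmed at 1024 Hz; 8192 Hz is the RTC
       * hardware maximum.
       */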
res = HostIFDoIoctl(filp, RTC_IRQP_SET, p2rate);
if (res < 0) {
Warning("/dev/rtc set rate %d failed: %d\n", p2rate, res);
return -1;
}
if (kthread_should_stop()) {
return -1;
}
}
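   /*
    * Block until the next RTC periodic interrupt; this blocking read is
    * what provides the fine-grained sleep.
    */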
res = filp->f_op->read(filp, (void *) &buf, sizeof(buf), &pos);
if (res <= 0) {
if (res != -ERESTARTSYS) {
Log("/dev/rtc read failed: %d\n", res);
}
return -1;
}
#endif
return 0;
}
/*
*----------------------------------------------------------------------
*
* HostIFFastClockThread --
*
* Kernel thread that provides finer-grained wakeups than the
* main system timers by using /dev/rtc. We can't do this at
* user level because /dev/rtc is not sharable (PR 19266). Also,
* we want to avoid the overhead of a context switch out to user
* level on every RTC interrupt.
*
* Results:
* Returns 0.
*
* Side effects:
* Wakeups and IPIs.
*
*----------------------------------------------------------------------
*/
static int
HostIFFastClockThread(void *data) // IN:
{
struct file *filp = (struct file *) data;
int res;
mm_segment_t oldFS;
unsigned int rate = 0;
unsigned int prevRate = 0;
oldFS = get_fs();
set_fs(KERNEL_DS);
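   /*
    * We just widened the address limit to kernel space so that the
    * blocking /dev/rtc read in HostIFStartTimer() can use an on-stack
    * kernel buffer; the old limit is restored on the way out.
    */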
allow_signal(SIGKILL);
set_user_nice(current, linuxState.fastClockPriority);
while ((rate = linuxState.fastClockRate) > MIN_RATE) {
if (kthread_should_stop()) {
goto out;
}
res = HostIFStartTimer(rate != prevRate, rate, filp);
if (res < 0) {
goto out;
}
prevRate = rate;
#if defined(CONFIG_SMP)
/*
* IPI each VCPU thread that is in the monitor and is due to
* fire a MonTimer callback.
*/
Vmx86_MonTimerIPI();
#endif
/*
* Wake threads that are waiting for a fast poll timeout at
* userlevel. This is needed only on Linux. On Windows,
* we get shorter timeouts simply by increasing the host
* clock rate.
*/
LinuxDriverWakeUp(TRUE);
}
out:
LinuxDriverWakeUp(TRUE);
set_fs(oldFS);
/*
* Do not exit thread until we are told to do so.
*/
do {
set_current_state(TASK_UNINTERRUPTIBLE);
if (kthread_should_stop()) {
break;
}
schedule();
} while (1);
set_current_state(TASK_RUNNING);
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SetFastClockRate --
*
* The monitor wants to poll for events at the given rate.
* Ensure that the host OS's timer interrupts come at least at
* this rate. If the requested rate is greater than the rate at
* which timer interrupts will occur on CPUs other than 0, then
* also arrange to call Vmx86_MonitorPollIPI on every timer
* interrupt, in order to relay IPIs to any other CPUs that need
* them.
*
* Locking:
* The caller must hold the fast clock lock.
*
* Results:
* 0 for success; positive error code if /dev/rtc could not be opened.
*
* Side effects:
* As described above.
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SetFastClockRate(unsigned int rate) // IN: Frequency in Hz.
{
ASSERT(MutexIsLocked(&fastClockMutex));
linuxState.fastClockRate = rate;
/*
* Overview
* --------
* An SMP Linux kernel programs the 8253 timer (to increment the 'jiffies'
* counter) _and_ all local APICs (to run the scheduler code) to deliver
* interrupts HZ times a second.
*
* Time
* ----
* The kernel tries very hard to spread all these interrupts evenly over
* time, i.e. on a 1 CPU system, the 1 local APIC phase is shifted by 1/2
* period compared to the 8253, and on a 2 CPU system, the 2 local APIC
* phases are respectively shifted by 1/3 and 2/3 period compared to the
* 8253. This is done to reduce contention on locks guarding the global task
* queue.
*
* Space
* -----
* The 8253 interrupts are distributed between physical CPUs, evenly on a P3
* system, whereas on a P4 system physical CPU 0 gets all of them.
*
* Long story short, unless the monitor requested rate is significantly
* higher than HZ, we don't need to send IPIs or exclusively grab /dev/rtc
* to periodically kick vCPU threads running in the monitor on all physical
* CPUs.
*/
if (rate > MIN_RATE) {
if (!linuxState.fastClockThread) {
struct task_struct *rtcTask;
struct file *filp = NULL;
#if !defined(VMMON_USE_HIGH_RES_TIMERS)
int res;
filp = filp_open("/dev/rtc", O_RDONLY, 0);
if (IS_ERR(filp)) {
Warning("/dev/rtc open failed: %d\n", (int)(VA)filp);
return -(int)(VA)filp;
}
res = HostIFDoIoctl(filp, RTC_PIE_ON, 0);
if (res < 0) {
Warning("/dev/rtc enable interrupt failed: %d\n", res);
filp_close(filp, current->files);
return -res;
}
#endif
rtcTask = kthread_run(HostIFFastClockThread, filp, "vmware-rtc");
if (IS_ERR(rtcTask)) {
long err = PTR_ERR(rtcTask);
/*
* Ignore ERESTARTNOINTR silently, it occurs when signal is
* pending, and syscall layer automatically reissues operation
* after signal is handled.
*/
if (err != -ERESTARTNOINTR) {
Warning("/dev/rtc cannot start watch thread: %ld\n", err);
}
close_rtc(filp, current->files);
return -err;
}
linuxState.fastClockThread = rtcTask;
linuxState.fastClockFile = filp;
}
} else {
if (linuxState.fastClockThread) {
force_sig(SIGKILL, linuxState.fastClockThread);
kthread_stop(linuxState.fastClockThread);
close_rtc(linuxState.fastClockFile, current->files);
linuxState.fastClockThread = NULL;
linuxState.fastClockFile = NULL;
}
}
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_MapUserMem --
*
* Obtain kernel pointer to user memory. The pages backing the user memory
* address are locked into memory (this allows the pointer to be used in
* contexts where paging is undesirable or impossible).
*
* Results:
* On success, returns the kernel virtual address, along with a handle to
* be used for unmapping.
* On failure, returns NULL.
*
* Side effects:
* Yes.
*
*-----------------------------------------------------------------------------
*/
void *
HostIF_MapUserMem(VA addr, // IN: User memory virtual address
size_t size, // IN: Size of memory desired
VMMappedUserMem **handle) // OUT: Handle to mapped memory
{
void *p = (void *) (uintptr_t) addr;
VMMappedUserMem *newHandle;
VA offset = addr & (PAGE_SIZE - 1);
size_t numPagesNeeded = ((offset + size) / PAGE_SIZE) + 1;
size_t handleSize =
sizeof *newHandle + numPagesNeeded * sizeof newHandle->pages[0];
void *mappedAddr;
ASSERT(handle);
if (!access_ok(VERIFY_WRITE, p, size)) {
printk(KERN_ERR "%s: Couldn't verify write to uva 0x%p with size %"
FMTSZ"u\n", __func__, p, size);
return NULL;
}
newHandle = kmalloc(handleSize, GFP_KERNEL);
if (newHandle == NULL) {
printk(KERN_ERR "%s: Couldn't allocate %"FMTSZ"u bytes of memory\n",
__func__, handleSize);
return NULL;
}
if (HostIFGetUserPages(p, newHandle->pages, numPagesNeeded)) {
kfree(newHandle);
printk(KERN_ERR "%s: Couldn't get %"FMTSZ"u %s for uva 0x%p\n", __func__,
numPagesNeeded, numPagesNeeded > 1 ? "pages" : "page", p);
return NULL;
}
if (numPagesNeeded > 1) {
/*
* Unlike kmap(), vmap() can fail. If it does, we need to release the
* pages that we acquired in HostIFGetUserPages().
*/
mappedAddr = vmap(newHandle->pages, numPagesNeeded, VM_MAP, PAGE_KERNEL);
if (mappedAddr == NULL) {
unsigned int i;
for (i = 0; i < numPagesNeeded; i++) {
put_page(newHandle->pages[i]);
}
kfree(newHandle);
printk(KERN_ERR "%s: Couldn't vmap %"FMTSZ"u %s for uva 0x%p\n",
__func__, numPagesNeeded,
numPagesNeeded > 1 ? "pages" : "page", p);
return NULL;
}
} else {
mappedAddr = kmap(newHandle->pages[0]);
}
printk(KERN_DEBUG "%s: p = 0x%p, offset = 0x%p, numPagesNeeded = %"FMTSZ"u,"
" handleSize = %"FMTSZ"u, mappedAddr = 0x%p\n",
__func__, p, (void *)offset, numPagesNeeded, handleSize, mappedAddr);
newHandle->numPages = numPagesNeeded;
newHandle->addr = mappedAddr;
*handle = newHandle;
return mappedAddr + offset;
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_UnmapUserMem --
*
* Unmap user memory from HostIF_MapUserMem().
*
* Results:
* None.
*
* Side effects:
* Yes.
*
*-----------------------------------------------------------------------------
*/
void
HostIF_UnmapUserMem(VMMappedUserMem *handle) // IN: Handle to mapped memory
{
unsigned int i;
if (handle == NULL) {
return;
}
printk(KERN_DEBUG "%s: numPages = %"FMTSZ"u, addr = 0x%p\n",
__func__, handle->numPages, handle->addr);
if (handle->numPages > 1) {
vunmap(handle->addr);
} else {
kunmap(handle->pages[0]);
}
for (i = 0; i < handle->numPages; i++) {
put_page(handle->pages[i]);
}
kfree(handle);
}
/*
*-----------------------------------------------------------------------------
*
* HostIF_SafeRDMSR --
*
* Attempt to read a MSR, and handle the exception if the MSR
* is unimplemented.
*
* Results:
* 0 if successful, and MSR value is returned via *val.
*
* If the MSR is unimplemented, *val is set to 0, and a
* non-zero value is returned: -1 for Win32, -EFAULT for Linux,
* and 1 for MacOS.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
int
HostIF_SafeRDMSR(unsigned int msr, // IN
uint64 *val) // OUT: MSR value
{
int ret;
unsigned low, high;
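   /*
    * The rdmsr at label 2 is covered by an exception-table entry: if the
    * MSR is unimplemented, the resulting fault resumes execution at label
    * 3 in the .fixup section, which loads -EFAULT into ret and jumps back
    * to label 1.
    */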
asm volatile("2: rdmsr ; xor %0,%0\n"
"1:\n\t"
".section .fixup,\"ax\"\n\t"
"3: mov %4,%0 ; jmp 1b\n\t"
".previous\n\t"
VMW_ASM_EXTABLE(2b, 3b)
: "=r"(ret), "=a"(low), "=d"(high)
: "c"(msr), "i"(-EFAULT), "1"(0), "2"(0)); // init eax/edx to 0
*val = (low | ((u64)(high) << 32));
return ret;
}
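Before moving on to vmnet, it doesn't hurt to check right away that vmmon now compiles against the new kernel (same command as in the final step below):
# cd /usr/lib/vmware/modules/source/vmmon-only ; make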
Next, modify the file /usr/lib/vmware/modules/source/vmnet-only/bridge.c so that you end up with the following:
/*********************************************************
* Copyright (C) 1998-2013, 2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
#include "driver-config.h"
#define EXPORT_SYMTAB
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/mm.h>
#include "compat_skbuff.h"
#include <linux/sockios.h>
#include <linux/spinlock.h>
#include "compat_sock.h"
#define __KERNEL_SYSCALLS__
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/file.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#ifdef CONFIG_NET_RADIO
# include <linux/wireless.h>
#endif
#include "vmnetInt.h"
#include "compat_netdevice.h"
#include "vnetInt.h"
#include "smac.h"
#define VNET_BRIDGE_HISTORY 48
/*
* Bytes reserved before start of packet. As Ethernet header has 14 bytes,
* to get aligned IP header we must skip 2 bytes before packet. Not that it
* matters a lot for us, but using 2 is compatible with what newer 2.6.x
* kernels do.
*/
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN 2
#endif
#if LOGLEVEL >= 4
static struct timeval vnetTime;
#endif
typedef struct VNetBridge VNetBridge;
struct VNetBridge {
struct notifier_block notifier; // for device state changes
char name[VNET_NAME_LEN]; // name of net device (e.g., "eth0")
struct net_device *dev; // device structure for 'name'
struct sock *sk; // socket associated with skb's
struct packet_type pt; // used to add packet handler
Bool enabledPromisc; // track if promisc enabled
Bool warnPromisc; // tracks if warning has been logged
Bool forceSmac; // whether to use smac unconditionally
struct sk_buff *history[VNET_BRIDGE_HISTORY]; // avoid duplicate packets
spinlock_t historyLock; // protects 'history'
VNetPort port; // connection to virtual hub
Bool wirelessAdapter; // connected to wireless adapter?
struct SMACState *smac; // device structure for wireless
VNetEvent_Sender *eventSender; // event sender
};
typedef PacketStatus (* SMACINT SMACFunc)(struct SMACState *, SMACPackets *);
static int VNetBridgeUp(VNetBridge *bridge, Bool rtnlLock);
static void VNetBridgeDown(VNetBridge *bridge, Bool rtnlLock);
static int VNetBridgeNotify(struct notifier_block *this, u_long msg,
void *data);
static int VNetBridgeReceiveFromDev(struct sk_buff *skb,
struct net_device *dev,
struct packet_type *pt,
struct net_device *real_dev);
static void VNetBridgeFree(VNetJack *this);
static void VNetBridgeReceiveFromVNet(VNetJack *this, struct sk_buff *skb);
static Bool VNetBridgeCycleDetect(VNetJack *this, int generation);
static Bool VNetBridgeIsDeviceWireless(struct net_device *dev);
static void VNetBridgePortsChanged(VNetJack *this);
static int VNetBridgeIsBridged(VNetJack *this);
static int VNetBridgeProcRead(char *page, char **start, off_t off,
int count, int *eof, void *data);
static void VNetBridgeComputeHeaderPosIPv6(struct sk_buff *skb);
static PacketStatus VNetCallSMACFunc(struct SMACState *state,
struct sk_buff **skb, void *startOfData,
SMACFunc func, unsigned int len);
/*
*----------------------------------------------------------------------
*
* VNetBridgeStartPromisc --
*
* Set IFF_PROMISC on the peer interface.
*
* Results:
* None.
*
* Side effects:
* The peer interface IFF_PROMISC flag may be changed.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeStartPromisc(VNetBridge *bridge, // IN:
Bool rtnlLock) // IN: Acquire RTNL lock
{
struct net_device *dev = bridge->dev;
/*
    * Don't put wireless cards into promiscuous mode: even those cards
    * which do support RF monitoring would not be able to function
    * correctly, i.e. they would not be able to send data packets.
*/
if (rtnlLock) {
rtnl_lock();
}
if (!bridge->enabledPromisc && !bridge->wirelessAdapter) {
dev_set_promiscuity(dev, 1);
bridge->enabledPromisc = TRUE;
bridge->warnPromisc = FALSE;
LOG(0, (KERN_NOTICE "bridge-%s: enabled promiscuous mode\n",
bridge->name));
}
if (rtnlLock) {
rtnl_unlock();
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeStopPromisc --
*
* Restore saved IFF_PROMISC on the peer interface.
*
* Results:
* None.
*
* Side effects:
* The peer interface IFF_PROMISC flag may be changed.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeStopPromisc(VNetBridge *bridge, // IN:
Bool rtnlLock) // IN: Acquire RTNL lock
{
struct net_device *dev = bridge->dev;
if (rtnlLock) {
rtnl_lock();
}
if (bridge->enabledPromisc && !bridge->wirelessAdapter) {
dev_set_promiscuity(dev, -1);
bridge->enabledPromisc = FALSE;
LOG(0, (KERN_NOTICE "bridge-%s: disabled promiscuous mode\n",
bridge->name));
}
if (rtnlLock) {
rtnl_unlock();
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeDevCompatible --
*
* Check whether bridge and network device are compatible.
*
* Results:
* Non-zero if device is good enough for bridge. Zero otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER int
VNetBridgeDevCompatible(VNetBridge *bridge, // IN: Bridge
struct net_device *net) // IN: Network device
{
#ifdef VMW_NETDEV_HAS_NET
if (compat_dev_net(net) != &init_net) {
return 0;
}
#endif
return strcmp(net->name, bridge->name) == 0;
}
/*
*----------------------------------------------------------------------
*
* VNetBridge_Create --
*
* Creates a bridge. Allocates struct, allocates internal device,
* initializes port/jack, and creates a proc entry. Finally, creates an
 * event sender and registers itself with the kernel for device state
* change notifications.
*
* At this time the bridge is not yet plugged into the hub, because this
* will be done by the caller, i.e. the driver. But we need to know the
* hub in order to create an event sender. This allows for enabling
* the notification mechanism, which will instantly start firing, which in
* turn will bring up the bridge (if present), which eventually will
* inject bridge events. Moreover, the bridge will start injecting
* packets, which will be dropped on the floor. All in all, this is not
* that elegant. Alternatively, we could (i) plug into the hub inside of
* this function, which would require adding a few parameters, (ii) split
* the function into a create part and a registration part. Both ways are
* not consistent with how driver.c plugs the ports into the hub.
*
* Results:
* Errno. Also returns an allocated jack to connect to,
* NULL on error.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
VNetBridge_Create(const char *devName, // IN: name of device (e.g., "eth0")
uint32 flags, // IN: configuration flags
VNetJack *hubJack, // IN: the future hub
VNetPort **ret) // OUT: port to virtual hub
{
VNetBridge *bridge = NULL;
static unsigned id = 0;
int retval = 0;
*ret = NULL;
/*
 * It's an error if the device name is empty.
*/
if (devName[0] == '\0') {
retval = -EINVAL;
goto out;
}
/* complain about unknown/unsupported flags */
if (flags & ~VNET_BRFLAG_FORCE_SMAC) {
retval = -EINVAL;
goto out;
}
/*
* Allocate bridge structure
*/
bridge = kmalloc(sizeof *bridge, GFP_USER);
if (bridge == NULL) {
retval = -ENOMEM;
goto out;
}
memset(bridge, 0, sizeof *bridge);
spin_lock_init(&bridge->historyLock);
memcpy(bridge->name, devName, sizeof bridge->name);
NULL_TERMINATE_STRING(bridge->name);
/*
* Initialize jack.
*/
bridge->port.id = id++;
bridge->port.next = NULL;
bridge->port.jack.peer = NULL;
bridge->port.jack.numPorts = 1;
VNetSnprintf(bridge->port.jack.name, sizeof bridge->port.jack.name,
"bridge%u", bridge->port.id);
bridge->port.jack.private = bridge;
bridge->port.jack.index = 0;
bridge->port.jack.procEntry = NULL;
bridge->port.jack.free = VNetBridgeFree;
bridge->port.jack.rcv = VNetBridgeReceiveFromVNet;
bridge->port.jack.cycleDetect = VNetBridgeCycleDetect;
bridge->port.jack.portsChanged = VNetBridgePortsChanged;
bridge->port.jack.isBridged = VNetBridgeIsBridged;
/*
* Make proc entry for this jack.
*/
retval = VNetProc_MakeEntry(bridge->port.jack.name, S_IFREG, bridge,
VNetBridgeProcRead,
&bridge->port.jack.procEntry);
if (retval) {
if (retval == -ENXIO) {
bridge->port.jack.procEntry = NULL;
} else {
goto out;
}
}
/*
* Rest of fields.
*/
bridge->port.flags = IFF_RUNNING;
memset(bridge->port.paddr, 0, sizeof bridge->port.paddr);
memset(bridge->port.ladrf, 0, sizeof bridge->port.ladrf);
bridge->port.paddr[0] = VMX86_STATIC_OUI0;
bridge->port.paddr[1] = VMX86_STATIC_OUI1;
bridge->port.paddr[2] = VMX86_STATIC_OUI2;
bridge->port.fileOpRead = NULL;
bridge->port.fileOpWrite = NULL;
bridge->port.fileOpIoctl = NULL;
bridge->port.fileOpPoll = NULL;
/* misc. configuration */
bridge->forceSmac = (flags & VNET_BRFLAG_FORCE_SMAC) ? TRUE : FALSE;
/* create event sender */
retval = VNetHub_CreateSender(hubJack, &bridge->eventSender);
if (retval != 0) {
goto out;
}
/*
* on RHEL3 Linux 2.4.21-47 (others maybe too) the notifier does not fire
* and bring up the bridge as expected, thus we bring it up manually
* *before* registering the notifier (PR306435)
*/
VNetBridgeUp(bridge, TRUE);
/*
* register notifier for network device state change notifications, the
* notifier will fire right away, and the notifier handler will bring up
* the bridge (see exception above)
*/
bridge->notifier.notifier_call = VNetBridgeNotify;
bridge->notifier.priority = 0;
register_netdevice_notifier(&bridge->notifier);
/* return bridge */
*ret = &bridge->port;
LOG(1, (KERN_DEBUG "bridge-%s: attached\n", bridge->name));
return 0;
out:
if (bridge != NULL) {
kfree(bridge);
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeFree --
*
* Unregister from device state notifications, disable the bridge,
* destroy sender, remove proc entry, cleanup smac, and deallocate
* struct.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
VNetBridgeFree(VNetJack *this) // IN: jack to free
{
VNetBridge *bridge = (VNetBridge*)this->private;
/* unregister notifier */
if (bridge->notifier.notifier_call != NULL) {
int err;
err = compat_unregister_netdevice_notifier(&bridge->notifier);
if (err != 0) {
LOG(0, (KERN_NOTICE "Can't unregister netdevice notifier (%d)\n",
err));
}
bridge->notifier.notifier_call = NULL;
}
/* disable bridge */
if (bridge->dev != NULL) {
LOG(1, (KERN_DEBUG "bridge-%s: disabling the bridge\n", bridge->name));
VNetBridgeDown(bridge, TRUE);
}
/* destroy event sender */
VNetEvent_DestroySender(bridge->eventSender);
bridge->eventSender = NULL;
/* remove /proc entry */
if (this->procEntry) {
VNetProc_RemoveEntry(this->procEntry);
}
if (bridge->smac){
SMAC_CleanupState(&(bridge->smac));
}
/* free bridge */
LOG(1, (KERN_DEBUG "bridge-%s: detached\n", bridge->name));
kfree(bridge);
}
/*
*----------------------------------------------------------------------
*
* VNetCallSMACFunc --
*
* Wrapper for SMAC functions. The skb must be linear.
*
* Results:
* Packet Status.
*
* Side effects:
* The skb buffer is freed if not successful otherwise it points to
* the clone.
*
*----------------------------------------------------------------------
*/
static PacketStatus
VNetCallSMACFunc(struct SMACState *state, // IN: pointer to state
struct sk_buff **skb, // IN/OUT: packet to process
void *startOfData, // IN: points to start of data
SMACFunc func, // IN: function to be called
unsigned int len) // IN: length including ETH header
{
SMACPackets packets = { {0} };
PacketStatus status;
SKB_LINEAR_ASSERT(*skb);
packets.orig.skb = *skb;
packets.orig.startOfData = startOfData;
packets.orig.len = len;
status = func(state, &packets);
if (status != PacketStatusForwardPacket) {
dev_kfree_skb(*skb);
return status;
}
if (packets.clone.skb) {
dev_kfree_skb(*skb);
*skb = packets.clone.skb;
}
return status;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeReceiveFromVNet --
*
* This jack is receiving a packet from a vnet. This function
* sends down (i.e., out on the host net device) if the packet
* isn't destined for the host, and it sends up (i.e.,
* simulates a receive for the host) if the packet
* satisfies the host's packet filter.
*
* When the function sends up it keeps a reference to the
* packet in a history list so that we can avoid handing
* a VM a copy of its own packet.
*
* Results:
* None.
*
* Side effects:
* Frees skb. Checks if host device is still using
* promiscuous mode.
*
*----------------------------------------------------------------------
*/
void
VNetBridgeReceiveFromVNet(VNetJack *this, // IN: jack
struct sk_buff *skb) // IN: pkt to receive
{
VNetBridge *bridge = (VNetBridge*)this->private;
struct net_device *dev = bridge->dev;
uint8 dest[ETH_ALEN];
struct sk_buff *clone;
LOG(3, (KERN_DEBUG "bridge-%s: transmit %d\n",
bridge->name, (int) skb->len));
if (!dev) {
dev_kfree_skb(skb);
return;
}
/*
* skb might be freed by wireless code, so need to keep
* a local copy of the MAC rather than a pointer to it.
*/
memcpy(dest, SKB_2_DESTMAC(skb), ETH_ALEN);
#ifdef notdef
// xxx;
/*
* We need to send the packet both up to the host and down
* to the interface.
* However, we ignore packets destined only for this hub.
*/
for (i = 0; i < VNET_PORTS_PER_HUB; i++) {
VNetPort *p = &port->hub->port[i];
if (UP_AND_RUNNING(p->flags) && MAC_EQ(dest, p->paddr)) {
return;
}
}
#endif
/*
    * SMAC processing. SMAC interfaces assume that the skb is linear, so
    * ensure that this is the case prior to calling out.
*/
if (bridge->smac) {
if (compat_skb_is_nonlinear(skb) && compat_skb_linearize(skb)) {
LOG(4, (KERN_NOTICE "bridge-%s: couldn't linearize, packet dropped\n",
bridge->name));
return;
}
if (VNetCallSMACFunc(bridge->smac, &skb, skb->data,
SMAC_CheckPacketToHost, skb->len) !=
PacketStatusForwardPacket) {
LOG(4, (KERN_NOTICE "bridge-%s: packet dropped\n", bridge->name));
return;
}
}
/*
* Send down (imitate packet_sendmsg)
*
* Do this only if the packet is not addressed to the peer,
* and the packet size is not too big.
*/
dev_lock_list();
if (MAC_EQ(dest, dev->dev_addr) ||
skb->len > dev->mtu + dev->hard_header_len) {
dev_unlock_list();
} else {
# if 0 // XXX we should do header translation
if ((dev->flags & IFF_SOFTHEADERS) != 0) {
if (skb->len > dev->mtu) {
clone = NULL;
} else {
clone = dev_alloc_skb(skb->len + dev->hard_header_len, GFP_ATOMIC);
}
if (clone != NULL) {
skb_reserve(clone, dev->hard_header_len);
if (dev->hard_header != NULL) {
dev->hard_header(clone, dev, ETH_P_IP, NULL, NULL, skb->len);
}
memcpy(skb_put(clone, skb->len), skb->data, skb->len);
}
}
# endif
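      /*
       * Clone rather than hand off the original skb: the original is
       * still needed below for the "send up" path and is freed at the
       * end of this function.
       */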
clone = skb_clone(skb, GFP_ATOMIC);
if (clone == NULL) {
dev_unlock_list();
} else {
skb_set_owner_w(clone, bridge->sk);
clone->protocol = ((struct ethhdr *)skb->data)->h_proto; // XXX
if ((dev->flags & IFF_UP) != 0) {
dev_unlock_list();
DEV_QUEUE_XMIT(clone, dev, 0);
} else {
dev_unlock_list();
dev_kfree_skb(clone);
}
}
}
/*
* Send up (imitate Ethernet receive)
*
* Do this if the packet is addressed to the peer (or is broadcast, etc.).
*
* This packet will get back to us, via VNetBridgeReceive.
* We save it so we can recognize it (and its clones) again.
*/
if (VNetPacketMatch(dest, dev->dev_addr, NULL, 0, allMultiFilter, dev->flags)) {
clone = skb_clone(skb, GFP_ATOMIC);
if (clone) {
unsigned long flags;
int i;
clone = skb_get(clone);
clone->dev = dev;
clone->protocol = eth_type_trans(clone, dev);
spin_lock_irqsave(&bridge->historyLock, flags);
for (i = 0; i < VNET_BRIDGE_HISTORY; i++) {
if (bridge->history[i] == NULL) {
bridge->history[i] = clone;
# if LOGLEVEL >= 3
{
int j;
int count = 0;
for (j = 0; j < VNET_BRIDGE_HISTORY; j++) {
if (bridge->history[j] != NULL) {
count++;
}
}
LOG(3, (KERN_DEBUG "bridge-%s: host slot %d history %d\n",
bridge->name, i, count));
}
# endif
break;
}
}
if (i >= VNET_BRIDGE_HISTORY) {
LOG(1, (KERN_NOTICE "bridge-%s: history full\n",
bridge->name));
for (i = 0; i < VNET_BRIDGE_HISTORY; i++) {
struct sk_buff *s = bridge->history[i];
/*
* We special case 0 to avoid races with another thread on
* another cpu wanting to use the 0 entry. This could happen
* when we release the lock to free the former entry.
* See bug 11231 for details.
*/
if (i == 0) {
bridge->history[0] = clone;
} else {
bridge->history[i] = NULL;
}
if (s) {
spin_unlock_irqrestore(&bridge->historyLock, flags);
dev_kfree_skb(s);
spin_lock_irqsave(&bridge->historyLock, flags);
}
}
}
spin_unlock_irqrestore(&bridge->historyLock, flags);
/*
* We used to cli() before calling netif_rx() here. It was probably
* unneeded (as we never did it in netif.c, and the code worked). In
* any case, now that we are using netif_rx_ni(), we should certainly
* not do it, or netif_rx_ni() will deadlock on the cli() lock --hpreg
*/
netif_rx_ni(clone);
# if LOGLEVEL >= 4
do_gettimeofday(&vnetTime);
# endif
}
}
// xxx;
dev_kfree_skb(skb);
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeCycleDetect --
*
* Cycle detection algorithm.
*
* Results:
* TRUE if a cycle was detected, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
VNetBridgeCycleDetect(VNetJack *this, // IN: jack
int generation) // IN: generation
{
VNetBridge *bridge = (VNetBridge*)this->private;
return VNetCycleDetectIf(bridge->name, generation);
}
/*
*----------------------------------------------------------------------
*
* VNetBridgePortsChanged --
*
 * The number of ports connected to this jack has changed; react
* accordingly by starting/stopping promiscuous mode based on
* whether any peers exist.
*
* Results:
* None.
*
* Side effects:
* Promiscuous mode may be started or stopped.
*
*----------------------------------------------------------------------
*/
void
VNetBridgePortsChanged(VNetJack *this) // IN: jack
{
VNetBridge *bridge = (VNetBridge*)this->private;
if (bridge->dev) {
if (VNetGetAttachedPorts(this)) {
VNetBridgeStartPromisc(bridge, TRUE);
} else {
VNetBridgeStopPromisc(bridge, TRUE);
}
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeIsBridged --
*
* Reports if the bridged interface is up or down.
*
* Results:
* 1 - we are bridged but the interface is not up
* 2 - we are bridged and the interface is up
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
VNetBridgeIsBridged(VNetJack *this) // IN: jack
{
VNetBridge *bridge = (VNetBridge*)this->private;
if (bridge->dev) {
return 2;
} else {
return 1;
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeIsDeviceWireless --
*
* Check if the device is a wireless adapter, depending on the version
* of the wireless extension present in the kernel.
*
* Results:
* TRUE if the device is wireless, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static Bool
VNetBridgeIsDeviceWireless(struct net_device *dev) //IN: sock
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 22)
# if defined(CONFIG_WIRELESS_EXT)
return dev->ieee80211_ptr != NULL || dev->wireless_handlers != NULL;
# else
return dev->ieee80211_ptr != NULL;
# endif
#elif defined(CONFIG_WIRELESS_EXT)
return dev->wireless_handlers != NULL;
#elif !defined(CONFIG_NET_RADIO)
return FALSE;
#elif defined WIRELESS_EXT && WIRELESS_EXT > 19
return dev->wireless_handlers != NULL;
#elif defined WIRELESS_EXT && WIRELESS_EXT > 12
return dev->wireless_handlers != NULL || dev->get_wireless_stats != NULL;
#else
return dev->get_wireless_stats != NULL;
#endif
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeSendLinkStateEvent --
*
* Sends a link state event.
*
* Results:
* Returns 0 if successful, or a negative value if an error occurs.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static int
VNetBridgeSendLinkStateEvent(VNetBridge *bridge, // IN: the bridge
uint32 adapter, // IN: the adapter
Bool up) // IN: the link state
{
VNet_LinkStateEvent event;
int res;
event.header.size = sizeof event;
res = VNetEvent_GetSenderId(bridge->eventSender, &event.header.senderId);
if (res != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event, "
"getSenderId failed (%d)\n", bridge->name, res));
return res;
}
event.header.eventId = 0;
event.header.classSet = VNET_EVENT_CLASS_UPLINK;
event.header.type = VNET_EVENT_TYPE_LINK_STATE;
event.adapter = adapter;
event.up = up;
res = VNetEvent_Send(bridge->eventSender, &event.header);
if (res != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event, send "
"failed (%d)\n", bridge->name, res));
}
return res;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeUp --
*
* Bring a bridge up. Gets peer's device structure, verifies
* that interface is up, checks the header length,
* allocates a socket, adds a packet handler to the network
* stack, and then places the peer's device in promiscuous
* mode.
*
* Results:
* errno.
*
* Side effects:
* Bridging may be brought up with a peer interface.
*
*----------------------------------------------------------------------
*/
static int
VNetBridgeUp(VNetBridge *bridge, // IN: bridge struct
Bool rtnlLock) // IN: acquire RTNL lock
{
int retval = 0;
if (bridge->dev != NULL) {
LOG(0, (KERN_NOTICE "bridge-%s: already up\n", bridge->name));
goto out;
}
/*
* Get peer device structure
*/
dev_lock_list();
bridge->dev = DEV_GET(bridge);
LOG(2, (KERN_DEBUG "bridge-%s: got dev %p\n",
bridge->name, bridge->dev));
if (bridge->dev == NULL) {
dev_unlock_list();
retval = -ENODEV;
goto out;
}
if (!(bridge->dev->flags & IFF_UP)) {
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is not up\n",
bridge->name, bridge->dev->name));
dev_unlock_list();
retval = -ENODEV;
goto out;
}
/*
* At a minimum, the header size should be the same as ours.
*
* XXX we should either do header translation or ensure this
* is an Ethernet.
*/
if (bridge->dev->hard_header_len != ETH_HLEN) {
LOG(1, (KERN_DEBUG "bridge-%s: can't bridge with %s, bad header length %d\n",
bridge->name, bridge->dev->name, bridge->dev->hard_header_len));
dev_unlock_list();
retval = -EINVAL;
goto out;
}
/*
* Get a socket to play with
*
* We set the dead field so we don't get a call back from dev_kfree_skb().
* (The alternative is to support the callback.)
*/
bridge->sk = compat_sk_alloc(bridge, GFP_ATOMIC);
if (bridge->sk == NULL) {
dev_unlock_list();
retval = -ENOMEM;
goto out;
}
sock_init_data(NULL, bridge->sk);
sock_set_flag(bridge->sk, SOCK_DEAD);
if (VNetBridgeIsDeviceWireless(bridge->dev)) {
LOG(1, (KERN_NOTICE "bridge-%s: device is wireless, enabling SMAC\n",
bridge->name));
bridge->wirelessAdapter = TRUE;
}
/*
* If it is a wireless adapter initialize smac struct.
*/
if (bridge->wirelessAdapter || bridge->forceSmac) {
SMAC_InitState(&(bridge->smac));
if (bridge->smac) {
/*
* Store the MAC address of the adapter
*/
SMAC_SetMac(bridge->smac, bridge->dev->dev_addr);
}
}
/*
* Link up with the peer device by adding a
* packet handler to the networking stack.
*/
bridge->pt.func = VNetBridgeReceiveFromDev;
bridge->pt.type = htons(ETH_P_ALL);
bridge->pt.dev = bridge->dev;
bridge->pt.af_packet_priv = bridge->sk;
bridge->enabledPromisc = FALSE;
bridge->warnPromisc = FALSE;
dev_add_pack(&bridge->pt);
dev_unlock_list();
/*
* Put in promiscuous mode if need be.
*/
mutex_lock(&vnetStructureMutex);
if (VNetGetAttachedPorts(&bridge->port.jack)) {
VNetBridgeStartPromisc(bridge, rtnlLock);
}
mutex_unlock(&vnetStructureMutex);
/* send link state up event */
retval = VNetBridgeSendLinkStateEvent(bridge, bridge->dev->ifindex, TRUE);
if (retval != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event (%d)\n",
bridge->name, retval));
goto out;
}
LOG(1, (KERN_DEBUG "bridge-%s: up\n", bridge->name));
/*
* Return
*/
out:
if (retval != 0) {
if (bridge->sk != NULL) {
sk_free(bridge->sk);
bridge->sk = NULL;
}
bridge->dev = NULL;
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeDown --
*
* Bring a bridge down. Stops promiscuous mode, removes the
* packet handler from the network stack, and frees the
* socket.
*
* Results:
* None.
*
* Side effects:
* Bridging is brought down.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeDown(VNetBridge *bridge, // IN: bridge
Bool rtnlLock) // IN: acquire RTNL lock
{
int retval;
if (bridge->dev == NULL) {
LOG(0, (KERN_NOTICE "bridge-%s: already down\n", bridge->name));
return;
}
/* send link state down event */
retval = VNetBridgeSendLinkStateEvent(bridge, bridge->dev->ifindex, FALSE);
if (retval != 0) {
LOG(1, (KERN_NOTICE "bridge-%s: can't send link state event (%d)\n",
bridge->name, retval));
}
VNetBridgeStopPromisc(bridge, rtnlLock);
if (bridge->smac){
SMAC_SetMac(bridge->smac, NULL);
}
bridge->dev = NULL;
dev_remove_pack(&bridge->pt);
sk_free(bridge->sk);
bridge->sk = NULL;
LOG(1, (KERN_DEBUG "bridge-%s: down\n", bridge->name));
}
/*
*-----------------------------------------------------------------------------
*
* VNetBridgeNotifyLogBridgeUpError --
*
* Logs a bridge up error for the notify function following this function.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static void
VNetBridgeNotifyLogBridgeUpError(int errno, // IN: the error number
char *bridgeName, // IN: the bridge name
char *devName) // IN: the device name
{
switch (errno) {
case -ENODEV:
LOG(0, (KERN_WARNING "bridge-%s: interface %s not found or not "
"up\n", bridgeName, devName));
break;
case -EINVAL:
LOG(0, (KERN_WARNING "bridge-%s: interface %s is not a valid "
"Ethernet interface\n", bridgeName, devName));
break;
case -ENOMEM:
LOG(0, (KERN_WARNING "bridge-%s: failed to allocate memory\n",
bridgeName));
break;
default:
/* This should never happen --hpreg */
LOG(0, (KERN_WARNING "bridge-%s: failed to enable the bridge to "
"interface %s (error %d)\n", bridgeName, devName,
-errno));
break;
}
}
/*
*-----------------------------------------------------------------------------
*
* VNetBridgeNotify --
*
* Callback on peer device state change. The function brings
* the bridge up/down in response to changes in the peer device.
*
* Results:
* NOTIFY_DONE
*
* Side effects:
* Promiscuous mode is changed when bridge brought up/down.
*
*-----------------------------------------------------------------------------
*/
static int
VNetBridgeNotify(struct notifier_block *this, // IN: callback data (bridge)
u_long msg, // IN: type of event
void *data) // IN: net_device or notifier info
{
VNetBridge *bridge = list_entry(this, VNetBridge, notifier);
struct net_device *dev;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)
dev = netdev_notifier_info_to_dev(data);
#else
dev = (struct net_device *)data;
#endif
switch (msg) {
case NETDEV_UNREGISTER:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is unregistering\n",
bridge->name, dev->name));
if (dev == bridge->dev) {
/* This should never happen --hpreg */
LOG(0, (KERN_WARNING "bridge-%s: interface %s unregistered without "
"going down! Disabling the bridge\n", bridge->name,
dev->name));
VNetBridgeDown(bridge, FALSE);
}
break;
case NETDEV_DOWN:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is going down\n",
bridge->name, dev->name));
if (dev == bridge->dev) {
LOG(1, (KERN_DEBUG "bridge-%s: disabling the bridge on dev down\n",
bridge->name));
VNetBridgeDown(bridge, FALSE);
}
break;
case NETDEV_UP:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is going up\n",
bridge->name, dev->name));
if (bridge->dev == NULL && VNetBridgeDevCompatible(bridge, dev)) {
int errno;
LOG(1, (KERN_DEBUG "bridge-%s: enabling the bridge on dev up\n",
bridge->name));
errno = VNetBridgeUp(bridge, FALSE);
if (errno != 0) {
VNetBridgeNotifyLogBridgeUpError(errno, bridge->name, dev->name);
}
}
break;
default:
LOG(2, (KERN_DEBUG "bridge-%s: interface %s is sending notification "
"0x%lx\n", bridge->name, dev->name, msg));
break;
}
return NOTIFY_DONE;
}
/*
*----------------------------------------------------------------------
*
* RangeInLinearSKB --
*
* Checks if the given number of bytes from a given offset resides
 * within the linear part of the skb. If not, it attempts to
* linearize the skb.
*
* Results:
* Returns TRUE if the range of bytes is already in the linear
* portion or if linearize succeeded. Otherwise, returns FALSE if
* the linearize operation fails.
*
* Side effects:
* As in skb_linearize().
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER Bool
RangeInLinearSKB(struct sk_buff *skb, // IN:
unsigned int start, // IN: Start offset
unsigned int length) // IN: How many bytes
{
if (LIKELY(!compat_skb_is_nonlinear(skb) ||
start + length <= compat_skb_headlen(skb))) {
/*
* Nothing to do.
*/
return TRUE;
}
return compat_skb_linearize(skb) == 0;
}
/*
* Not all kernel versions have NEXTHDR_MOBILITY defined.
*/
#ifndef NEXTHDR_MOBILITY
# define NEXTHDR_MOBILITY 135 /* Mobility header. */
#endif
/*
*----------------------------------------------------------------------
*
* VNetBridgeComputeHeaderPosIPv6 --
*
* Compute correct position of transport header in IPv6 packets.
*
* Results:
* None.
*
* Side effects:
* Transport header pointer updated to point to the PDU contained
* in the packet.
*
*----------------------------------------------------------------------
*/
static void
VNetBridgeComputeHeaderPosIPv6(struct sk_buff *skb) // IN:
{
struct ipv6hdr *ipv6Hdr;
unsigned int offset; /* Offset from skb->data. */
unsigned int headerLen; /* Length of current header. */
uint8 nextHeader;
/*
* Check if the start of the network header is within the linear part of
* skb. If not, then linearize the skb.
*/
if (UNLIKELY(compat_skb_network_header(skb) < skb->data ||
compat_skb_network_header(skb) >= skb->data +
compat_skb_headlen(skb))) {
if (compat_skb_linearize(skb)) {
return; /* Bail out. */
}
}
offset = compat_skb_network_offset(skb);
if (!RangeInLinearSKB(skb, offset, sizeof *ipv6Hdr)) {
return; /* Bail out. */
}
ipv6Hdr = (struct ipv6hdr *)compat_skb_network_header(skb);
headerLen = sizeof *ipv6Hdr;
offset += headerLen; /* End of IPv6 header (not including extensions). */
/*
* All IPv6 extension headers begin with a "next header" field (one byte),
* and most of them have a "header length" field (as the 2nd byte). In each
* iteration, we find the length of the extension header and add it to
* offset from the beginning of skb. And, in each iteration we update the
* next header variable. When we return from the following for loop, offset
 * will have been incremented by the length of each extension header,
 * and the next header type will be something other than an IPv6 extension
 * header, signifying that we have walked through the entire IPv6 header. We set
* the transport header's offset to the value of this offset before exiting
* the for loop.
*/
nextHeader = ipv6Hdr->nexthdr;
for (;;) {
switch (nextHeader) {
case NEXTHDR_HOP:
case NEXTHDR_ROUTING:
case NEXTHDR_AUTH:
case NEXTHDR_DEST:
case NEXTHDR_MOBILITY:
/*
* We need to check two bytes in the option header: next header and
* header extension length.
*/
if (!RangeInLinearSKB(skb, offset, 2)) {
return; /* Bail out. */
}
headerLen = skb->data[offset + 1];
if (nextHeader == NEXTHDR_AUTH) {
headerLen = (headerLen + 2) << 2; /* See RFC 2402. */
} else {
headerLen = (headerLen + 1) << 3; /* See ipv6_optlen(). */
}
break;
case NEXTHDR_FRAGMENT:
case NEXTHDR_ESP:
case NEXTHDR_NONE:
/*
* We stop walking if we find the fragment header (NEXTHDR_FRAGMENT).
* If the payload is encrypted we may not know the start of the
* transport header [1]. So, we just return. Same applies when
* nothing follows this header (NEXTHDR_NONE).
* [1]: http://www.cu.ipv6tf.org/literatura/chap8.pdf
*/
return;
default:
/*
* We have walked through all IPv6 extension headers. Let's set the
* transport header and return.
*/
compat_skb_set_transport_header(skb, offset);
return;
}
nextHeader = skb->data[offset];
offset += headerLen;
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeComputeHeaderPos --
*
* Compute correct position for UDP/TCP header.
*
* Results:
* None.
*
* Side effects:
* transport header pointer updated to point to the tcp/udp header.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
VNetBridgeComputeHeaderPos(struct sk_buff *skb) // IN: buffer to examine
{
/* Maybe some kernel gets it right... */
if (compat_skb_network_header_len(skb)) {
return;
}
switch (be16_to_cpu(skb->protocol)) {
case ETH_P_IP: {
struct iphdr *ipHdr = compat_skb_ip_header(skb);
compat_skb_set_transport_header(skb, compat_skb_network_offset(skb) +
ipHdr->ihl * 4);
break;
}
case ETH_P_IPV6:
VNetBridgeComputeHeaderPosIPv6(skb);
break;
default:
LOG(3, (KERN_DEBUG "Unknown EII protocol %04X: csum at %d\n",
be16_to_cpu(skb->protocol), compat_skb_csum_offset(skb)));
break;
}
}
/*
* We deal with three types of kernels:
* New kernels: skb_shinfo() has gso_size member, and there is
* skb_gso_segment() helper to split GSO skb into flat ones.
* Older kernels: skb_shinfo() has tso_size member, and there is
* no helper.
* Oldest kernels: without any segmentation offload support.
*/
#if defined(NETIF_F_GSO) || LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
#define VNetBridgeIsGSO(skb) skb_shinfo(skb)->gso_size
#define VNetBridgeGSOSegment(skb) skb_gso_segment(skb, 0)
#elif defined(NETIF_F_TSO)
#define VNetBridgeIsGSO(skb) skb_shinfo(skb)->tso_size
/*
*----------------------------------------------------------------------
*
* VNetBridgeGSOSegment --
*
* Split a large TCP/IPv4 sk_buff into multiple sk_buffs of
* size skb_shinfo(skb)->tso_size
* Called from VNetBridgeSendLargePacket().
*
* Results:
* List of skbs created.
*
* Side effects:
* The incoming packet is split into multiple packets.
*
*----------------------------------------------------------------------
*/
static struct sk_buff *
VNetBridgeGSOSegment(struct sk_buff *skb) // IN: packet to split
{
struct sk_buff *segs = NULL;
struct sk_buff **next = &segs;
int bytesPerPacket, bytesLeft;
int macHdrLen, ipHdrLen, tcpHdrLen, allHdrLen;
int curByteOffset;
uint16 ipID;
uint32 seqNo;
if (((struct ethhdr *)compat_skb_mac_header(skb))->h_proto != htons(ETH_P_IP)) {
return ERR_PTR(-EPFNOSUPPORT);
}
if (compat_skb_ip_header(skb)->protocol != IPPROTO_TCP) {
return ERR_PTR(-EPROTONOSUPPORT);
}
macHdrLen = compat_skb_network_header(skb) - compat_skb_mac_header(skb);
ipHdrLen = compat_skb_ip_header(skb)->ihl << 2;
tcpHdrLen = compat_skb_tcp_header(skb)->doff << 2;
allHdrLen = macHdrLen + ipHdrLen + tcpHdrLen;
ipID = ntohs(compat_skb_ip_header(skb)->id);
seqNo = ntohl(compat_skb_tcp_header(skb)->seq);
/* Host TCP stack populated this (MSS) for the host NIC driver */
bytesPerPacket = skb_shinfo(skb)->tso_size;
bytesLeft = skb->len - allHdrLen;
curByteOffset = allHdrLen;
while (bytesLeft) {
struct sk_buff *newSkb;
int payloadSize = (bytesLeft < bytesPerPacket) ? bytesLeft : bytesPerPacket;
newSkb = dev_alloc_skb(payloadSize + allHdrLen + NET_IP_ALIGN);
if (!newSkb) {
while (segs) {
newSkb = segs;
segs = segs->next;
newSkb->next = NULL;
dev_kfree_skb(newSkb);
}
return ERR_PTR(-ENOMEM);
}
skb_reserve(newSkb, NET_IP_ALIGN);
newSkb->dev = skb->dev;
newSkb->protocol = skb->protocol;
newSkb->pkt_type = skb->pkt_type;
newSkb->ip_summed = VM_TX_CHECKSUM_PARTIAL;
/*
* MAC+IP+TCP copy
* This implies that ALL fields in the IP and TCP headers are copied from
* the original skb. This is convenient: we'll only fix up fields that
* need to be changed below
*/
memcpy(skb_put(newSkb, allHdrLen), skb->data, allHdrLen);
/* Fix up pointers to different layers */
compat_skb_reset_mac_header(newSkb);
compat_skb_set_network_header(newSkb, macHdrLen);
compat_skb_set_transport_header(newSkb, macHdrLen + ipHdrLen);
/* Payload copy */
skb_copy_bits(skb, curByteOffset, compat_skb_tail_pointer(newSkb), payloadSize);
skb_put(newSkb, payloadSize);
curByteOffset+=payloadSize;
bytesLeft -= payloadSize;
/* Fix up IP hdr */
compat_skb_ip_header(newSkb)->tot_len = htons(payloadSize + tcpHdrLen + ipHdrLen);
compat_skb_ip_header(newSkb)->id = htons(ipID);
compat_skb_ip_header(newSkb)->check = 0;
/* Recompute new IP checksum */
compat_skb_ip_header(newSkb)->check =
ip_fast_csum(compat_skb_network_header(newSkb),
compat_skb_ip_header(newSkb)->ihl);
/* Fix up TCP hdr */
compat_skb_tcp_header(newSkb)->seq = htonl(seqNo);
/* Clear FIN/PSH if not last packet */
if (bytesLeft > 0) {
compat_skb_tcp_header(newSkb)->fin = 0;
compat_skb_tcp_header(newSkb)->psh = 0;
}
/* Recompute partial TCP checksum */
compat_skb_tcp_header(newSkb)->check =
~csum_tcpudp_magic(compat_skb_ip_header(newSkb)->saddr,
compat_skb_ip_header(newSkb)->daddr,
payloadSize+tcpHdrLen, IPPROTO_TCP, 0);
/* Offset of field */
newSkb->csum = offsetof(struct tcphdr, check);
/* Join packet to the list of segments */
*next = newSkb;
next = &newSkb->next;
/* Bump up our counters */
ipID++;
seqNo += payloadSize;
}
return segs;
}
#else
#define VNetBridgeIsGSO(skb) (0)
#define VNetBridgeGSOSegment(skb) ERR_PTR(-ENOSYS)
#endif
/*
*----------------------------------------------------------------------
*
* VNetBridgeSendLargePacket --
*
 * Split and send a large TCP/IPv4 sk_buff as multiple sk_buffs that
 * fit on the wire. Called from VNetBridgeReceiveFromDev(), which is a
* protocol handler called from the bottom half, so steady as she
* goes...
*
 * The skb passed in is deallocated by this function.
*
* Results:
* None.
*
* Side effects:
* The incoming packet is split into multiple packets and sent to the
* vnet.
*
*----------------------------------------------------------------------
*/
void
VNetBridgeSendLargePacket(struct sk_buff *skb, // IN: packet to split
VNetBridge *bridge) // IN: bridge
{
struct sk_buff *segs;
segs = VNetBridgeGSOSegment(skb);
dev_kfree_skb(skb);
if (IS_ERR(segs)) {
LOG(1, (KERN_DEBUG "bridge-%s: cannot segment packet: error %ld\n",
bridge->name, PTR_ERR(segs)));
return;
}
while (segs) {
struct sk_buff *newSkb;
newSkb = segs;
segs = newSkb->next;
newSkb->next = NULL;
/* Send it along */
skb = newSkb;
VNetSend(&bridge->port.jack, newSkb);
}
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeReceiveFromDev --
*
* Receive a packet from a bridged peer device
*
* This is called from the bottom half. Must be careful.
*
* Results:
* errno.
*
* Side effects:
* A packet may be sent to the vnet.
*
*----------------------------------------------------------------------
*/
int
VNetBridgeReceiveFromDev(struct sk_buff *skb, // IN: packet to receive
struct net_device *dev, // IN: unused
struct packet_type *pt, // IN: pt (pointer to bridge)
struct net_device *real_dev) // IN: real device, unused
{
VNetBridge *bridge = list_entry(pt, VNetBridge, pt);
int i;
unsigned long flags;
if (bridge->dev == NULL) {
LOG(3, (KERN_DEBUG "bridge-%s: received %d closed\n",
bridge->name, (int) skb->len));
dev_kfree_skb(skb);
return -EIO; // value is ignored anyway
}
/*
 * Check if this is a packet that we sent up to the host, and if
* so then don't bother to receive the packet.
*/
spin_lock_irqsave(&bridge->historyLock, flags);
for (i = 0; i < VNET_BRIDGE_HISTORY; i++) {
struct sk_buff *s = bridge->history[i];
if (s != NULL &&
(s == skb || SKB_IS_CLONE_OF(skb, s))) {
bridge->history[i] = NULL;
spin_unlock_irqrestore(&bridge->historyLock, flags);
dev_kfree_skb(s);
LOG(3, (KERN_DEBUG "bridge-%s: receive %d self %d\n",
bridge->name, (int) skb->len, i));
dev_kfree_skb(skb);
return 0;
}
}
spin_unlock_irqrestore(&bridge->historyLock, flags);
# if LOGLEVEL >= 4
{
struct timeval now;
do_gettimeofday(&now);
LOG(3, (KERN_DEBUG "bridge-%s: time %d\n",
bridge->name,
(int)((now.tv_sec * 1000000 + now.tv_usec)
- (vnetTime.tv_sec * 1000000 + vnetTime.tv_usec))));
}
# endif
/*
* SMAC might linearize the skb, but linearizing a shared skb is a no-no,
* so check for sharing before calling out to SMAC.
*/
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
return 0;
}
if (bridge->smac) {
/*
      * The wireless driver processes the packet and strips the Ethernet
      * header, reducing the length by that amount. We need the raw
      * Ethernet packet length, hence we add the Ethernet header length
      * back for incoming packets.
*
* Note that SMAC interfaces assume skb linearity.
*/
if (compat_skb_is_nonlinear(skb) && compat_skb_linearize(skb)) {
LOG(4, (KERN_NOTICE "bridge-%s: couldn't linearize, packet dropped\n",
bridge->name));
return 0;
}
if (VNetCallSMACFunc(bridge->smac, &skb, compat_skb_mac_header(skb),
SMAC_CheckPacketFromHost, skb->len + ETH_HLEN) !=
PacketStatusForwardPacket) {
LOG(4, (KERN_NOTICE "bridge-%s: packet dropped\n", bridge->name));
return 0;
}
}
/*
* Unbelievable... Caller sets h.raw = nh.raw before invoking us...
*/
VNetBridgeComputeHeaderPos(skb);
skb_push(skb, skb->data - compat_skb_mac_header(skb));
LOG(3, (KERN_DEBUG "bridge-%s: receive %d\n",
bridge->name, (int) skb->len));
/*
* If this is a large packet, chop chop chop (if supported)...
*/
if (VNetBridgeIsGSO(skb)) {
VNetBridgeSendLargePacket(skb, bridge);
} else {
VNetSend(&bridge->port.jack, skb);
}
return 0;
}
/*
*----------------------------------------------------------------------
*
* VNetBridgeProcRead --
*
* Callback for read operation on this bridge entry in vnets proc fs.
*
* Results:
* Length of read operation.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
VNetBridgeProcRead(char *page, // IN/OUT: buffer to write into
char **start, // OUT: 0 if file < 4k, else offset into page
off_t off, // IN: (unused) offset of read into the file
int count, // IN: (unused) maximum number of bytes to read
int *eof, // OUT: TRUE if there is nothing more to read
void *data) // IN: client data - pointer to bridge
{
VNetBridge *bridge = (VNetBridge*)data;
int len = 0;
if (!bridge) {
return len;
}
len += VNetPrintPort(&bridge->port, page+len);
len += sprintf(page+len, "dev %s ", bridge->name);
len += sprintf(page+len, "\n");
*start = 0;
*eof = 1;
return len;
}
Then resume the procedure described previously:
# cd /usr/lib/vmware/modules/source/vmmon-only ; make
etc ...
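For what it's worth, here is roughly what the rest looked like on my machine (my assumption: same module layout as in the previous post; adjust the paths to your kernel):
# cd /usr/lib/vmware/modules/source/vmnet-only ; make
# cp /usr/lib/vmware/modules/source/vmmon-only/vmmon.ko /lib/modules/$(uname -r)/misc/
# cp /usr/lib/vmware/modules/source/vmnet-only/vmnet.ko /lib/modules/$(uname -r)/misc/
# depmod -a
# modprobe vmmon
# modprobe vmnet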
Hoping this saves some of you the time I lost getting this *&@@@$ proprietary thing installed!!!
Have a good evening