-rw-r--r--  .SRCINFO         |  34
-rw-r--r--  PKGBUILD         |  73
-rw-r--r--  configure.patch  | 110
-rw-r--r--  cuda.patch       | 426
-rw-r--r--  mpi.patch        |  26
-rw-r--r--  ospray.patch     |  53
6 files changed, 722 insertions, 0 deletions
diff --git a/.SRCINFO b/.SRCINFO
new file mode 100644
index 000000000000..8cef04b31e2a
--- /dev/null
+++ b/.SRCINFO
@@ -0,0 +1,34 @@
+pkgbase = vmd-src
+	pkgdesc = Visual Molecular Dynamics
+	pkgver = 1.9.3
+	pkgrel = 1
+	url = http://www.ks.uiuc.edu/Research/vmd/
+	arch = x86_64
+	license = custom
+	makedepends = gcc
+	depends = tcsh
+	depends = tk
+	depends = python2-numpy
+	depends = fltk
+	depends = ospray
+	depends = netcdf
+	depends = ocl-icd
+	optdepends = openbabel: additional file formats support
+	optdepends = sqlite: dmsplugin
+	optdepends = optix: accelerated ray tracing for NVIDIA GPUs
+	optdepends = cuda: NVIDIA CUDA GPU acceleration functions
+	conflicts = vmd
+	conflicts = vmd-bin
+	source = local://vmd-1.9.3.src.tar.gz
+	source = configure.patch
+	source = ospray.patch
+	source = cuda.patch
+	source = mpi.patch
+	sha256sums = 5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128
+	sha256sums = c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275
+	sha256sums = 09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d
+	sha256sums = 8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa
+	sha256sums = e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0
+
+pkgname = vmd-src
+
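[Note, not part of the committed files: the .SRCINFO above mirrors the metadata declared in the PKGBUILD that follows. If the PKGBUILD is edited later, this file is normally regenerated rather than edited by hand, e.g.:

    $ makepkg --printsrcinfo > .SRCINFO
]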
diff --git a/PKGBUILD b/PKGBUILD
new file mode 100644
index 000000000000..fc9c5493cfe0
--- /dev/null
+++ b/PKGBUILD
@@ -0,0 +1,73 @@
+# Maintainer: Anton Kudelin <kudelin at protonmail dot com>
+# Contributor: Eric Berquist <eric DOT berquist AT gmail>
+# Contributor: steabert <steabert@member.fsf.org>
+# Contributor: Ricardo Honorato Z.
+
+pkgname=vmd-src
+_pkgname=vmd
+pkgver=1.9.3
+pkgrel=1
+pkgdesc="Visual Molecular Dynamics"
+url="http://www.ks.uiuc.edu/Research/vmd/"
+license=('custom')
+arch=('x86_64')
+depends=('tcsh' 'tk' 'python2-numpy' 'fltk' 'ospray' 'netcdf' 'ocl-icd')
+makedepends=('gcc')
+optdepends=('openbabel: additional file formats support'
+            'sqlite: dmsplugin'
+            'optix: accelerated ray tracing for NVIDIA GPUs'
+            'cuda: NVIDIA CUDA GPU acceleration functions')
+conflicts=("$_pkgname" "$_pkgname-bin")
+# You have to download the package from the VMD url and put it in the PKGBUILD folder.
+source=("local://$_pkgname-${pkgver}.src.tar.gz"
+        "configure.patch"
+        "ospray.patch"
+        "cuda.patch"
+        "mpi.patch")
+sha256sums=('5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128'
+            'c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275'
+            '09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d'
+            '8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa'
+            'e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0')
+
+prepare() {
+  sed -i 's/ltcl8.5/ltcl/g' plugins/Make-arch
+  cd $_pkgname-$pkgver
+  mkdir plugins
+  sed -i 's#:${LD_LIBRARY_PATH}/:${LD_LIBRARY_PATH}:#/opt/optix/lib64#g' bin/*
+  patch -p0 < ../configure.patch
+  patch -p0 < ../ospray.patch
+  patch -p0 < ../cuda.patch
+
+  # Assuming openmpi; if not the case edit mpi.patch
+  patch -p0 < ../mpi.patch
+  export TCLINC="-I/usr/include"
+  export TCLLIB="-L/usr/lib"
+  export PLUGINDIR=$srcdir/$_pkgname-$pkgver/plugins
+  export VMDINSTALLBINDIR=$pkgdir/usr/bin
+  export VMDINSTALLLIBRARYDIR=$pkgdir/usr/lib/vmd
+
+  # Enable CUDA if nvcc is in PATH
+  if [ $( echo -n $( which nvcc) | tail -c 4 ) == nvcc ]
+  then
+    export ACC=CUDA
+  fi
+}
+
+build() {
+  cd $srcdir/plugins
+  make -j1 LINUXAMD64 world
+  make distrib
+  cd ../$_pkgname-$pkgver
+  ./configure LINUXAMD64 OPENGL EGLPBUFFER FLTKOPENGL FLTK TK $ACC IMD OPENCL MPI XINERAMA XINPUT LIBOSPRAY LIBPNG NETCDF COLVARS TCL PYTHON PTHREADS GCC
+  cd src
+  make veryclean
+  make
+}
+
+package() {
+  cd $srcdir/$_pkgname-$pkgver
+  install -D -m644 LICENSE $pkgdir/usr/share/licenses/$pkgname/LICENSE
+  cd src; make install
+  sed -i 's#set defaultvmddir=.*#set defaultvmddir=/usr/lib/vmd#' $pkgdir/usr/bin/vmd
+}
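[Note, not part of the committed files: because the tarball is listed with a local:// URL, makepkg will not download it; as the comment in the PKGBUILD says, it must be fetched manually from the VMD site (which requires registering) and placed next to the PKGBUILD. A minimal sketch of the expected workflow, assuming the standard AUR clone URL and that the tarball was saved to ~/Downloads (adjust paths as needed):

    $ git clone https://aur.archlinux.org/vmd-src.git
    $ cd vmd-src
    $ cp ~/Downloads/vmd-1.9.3.src.tar.gz .   # name must match the source= entry
    $ makepkg -si

The nvcc test in prepare() only switches on CUDA when nvcc is already in PATH; an equivalent, simpler check would be: command -v nvcc >/dev/null && export ACC=CUDA
]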
diff --git a/configure.patch b/configure.patch
new file mode 100644
index 000000000000..ad6dd0acb5e3
--- /dev/null
+++ b/configure.patch
@@ -0,0 +1,110 @@
+--- configure	2016-12-01 10:11:33.000000000 +0300
++++ configure	2019-05-24 15:40:14.742991210 +0300
+@@ -466,7 +466,7 @@
+ 
+ $arch_cc = "cc";
+ $arch_ccpp = "CC";
+-$arch_nvcc = "/usr/local/cuda-8.0/bin/nvcc";
++$arch_nvcc = "nvcc";
+ $arch_nvccflags = "--ptxas-options=-v " .
+                   "-gencode arch=compute_30,code=compute_30 " .
+                   "-gencode arch=compute_30,code=sm_35 " .
+@@ -729,8 +729,8 @@
+ if ($config_tk) { $tcl_include .= " -I$stock_tk_include_dir"; }
+ $tcl_library = "-L$stock_tcl_library_dir";
+ if ($config_tk) { $tcl_library .= " -L$stock_tk_library_dir"; }
+-$tcl_libs = "-ltcl8.5";
+-if ($config_tk) { $tcl_libs = "-ltk8.5 -lX11 " . $tcl_libs; }
++$tcl_libs = "-ltcl";
++if ($config_tk) { $tcl_libs = "-ltk -lX11 " . $tcl_libs; }
+ 
+ @tcl_cc = ();
+ @tcl_cu = ();
+@@ -888,11 +888,7 @@
+ $system_dir = "";
+ $system_include = "-I.";
+ $system_library = "";
+-if ( $config_gcc ) {
+-  $system_libs = "-ll -lm";
+-} else {
+-  $system_libs = "-lm";
+-}
++$system_libs = "-lrt -lm";
+ @system_cc = ();
+ @system_cu = ();
+ @system_ccpp = ();
+@@ -922,7 +918,7 @@
+ # This option enables the use of CUDA GPU acceleration functions.
+ #######################
+ $cuda_defines = "-DVMDCUDA -DMSMPOT_CUDA";
+-$cuda_dir = "/usr/local/cuda-8.0";
++$cuda_dir = "/opt/cuda";
+ $cuda_include = "";
+ $cuda_library = "";
+ $cuda_libs = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart_static";
+@@ -1042,7 +1038,7 @@
+ # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.6.3-linux64";
+ # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.8.0-linux64";
+ # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.9.0-linux64";
+-$liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-4.0.1-linux64";
++$liboptix_dir = "/opt/optix";
+ # NCSA Blue Waters
+ # $liboptix_dir = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.6.3-linux64";
+ # $liboptix_dir = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.8.0-linux64";
+@@ -1082,7 +1078,7 @@
+ $libospray_dir = "/usr/local/ospray-1.1.1.x86_64.linux";
+ $libospray_include = "-I$libospray_dir/include -I$libospray_dir/ospray/include ";
+ $libospray_library = "-L$libospray_dir/lib ";
+-$libospray_libs = "-lospray -lospray_common -lembree -ltbb -ltbbmalloc ";
++$libospray_libs = "-lospray -lospray_common -lembree3 -ltbb -ltbbmalloc ";
+ # Both OptiX and OSPRay renderers use the Tachyon glwin code,
+ # so we have to make sure we don't build/link it twice
+ if ($config_liboptix) {
+@@ -1375,20 +1371,20 @@
+ ###################
+ # location of Python library and include file.
+ # If left blank, standard system directories will be searched.
+-#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/local/include";
++$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/include/python2.7";
+ #$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "/usr/local/lib";
+-$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5";
+-$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config";
++#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.7";
++#$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/config";
+ 
+ #$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "/usr/local/include";
+ #$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "/usr/local/lib";
+-$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include";
+-$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include";
++#$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include";
++#$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/site-packages/numpy/core/include";
+ 
+ $python_defines = "-DVMDPYTHON";
+ $python_include = "-I$stock_python_include_dir -I$stock_numpy_include_dir -I$stock_numpy_library_dir";
+ $python_library = "-L$stock_python_library_dir";
+-$python_libs = "-lpython2.5 -lpthread";
++$python_libs = "-lpython2.7 -lpthread";
+ @python_h = ('PythonTextInterp.h',
+              'VMDTkinterMenu.h',
+              'py_commands.h',
+@@ -2278,9 +2274,9 @@
+ # for compiling with g++:
+ $arch_cc = "gcc";
+ $arch_ccpp = "g++";
+-  $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -O6 -ffast-math";
++  $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -O3 -ffast-math";
+ $arch_depend_flag = "-MM";
+-  $arch_copts = "-m64 -Wall -O6 -ffast-math";
++  $arch_copts = "-m64 -Wall -O3 -ffast-math";
+ $arch_template_repository = "foobar";
+ 
+ # so far only STATIC version tested
+@@ -2293,7 +2289,7 @@
+ 
+ if ($config_cuda) {
+   $arch_nvccflags .= " --machine 64 -O3 $cuda_include";
+-  $cuda_library = "-L/usr/local/cuda-8.0/lib64";
++  $cuda_library = "-L/opt/cuda/lib64";
+ }
+ 
+ $arch_lex = "flex";   # has problems with vendor lex
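[Note, not part of the committed files: the patched configure hard-codes the Arch locations of the optional GPU components and of Python 2.7 instead of the VMD-bundled copies. A rough pre-build sanity check of those assumed paths (a sketch; the /opt/cuda and /opt/optix locations come straight from the patch above):

    $ test -d /opt/cuda && echo "CUDA toolkit present"
    $ test -d /opt/optix && echo "OptiX SDK present"
    $ test -f /usr/include/python2.7/Python.h && echo "Python 2.7 headers present"
    $ ldconfig -p | grep -E 'libospray|libembree3'
]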
-ffast-math"; + $arch_template_repository = "foobar"; + + # so far only STATIC version tested +@@ -2293,7 +2289,7 @@ + + if ($config_cuda) { + $arch_nvccflags .= " --machine 64 -O3 $cuda_include"; +- $cuda_library = "-L/usr/local/cuda-8.0/lib64"; ++ $cuda_library = "-L/opt/cuda/lib64"; + } + + $arch_lex = "flex"; # has problems with vendor lex diff --git a/cuda.patch b/cuda.patch new file mode 100644 index 000000000000..770360ec8d94 --- /dev/null +++ b/cuda.patch @@ -0,0 +1,426 @@ +--- src/CUDAMarchingCubes.cu 2018-03-30 18:52:25.467189457 +0300 ++++ src/CUDAMarchingCubes.cu 2018-03-30 18:52:02.387136244 +0300 +@@ -10,7 +10,7 @@ + * + * $RCSfile: CUDAMarchingCubes.cu,v $ + * $Author: johns $ $Locker: $ $State: Exp $ +- * $Revision: 1.30 $ $Date: 2016/11/28 03:04:58 $ ++ * $Revision: 1.32 $ $Date: 2018/02/15 05:15:02 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -25,14 +25,17 @@ + // + // Description: This class computes an isosurface for a given density grid + // using a CUDA Marching Cubes (MC) alorithm. +-// The implementation is based on the MC demo from the +-// Nvidia GPU Computing SDK, but has been improved +-// and extended. This implementation achieves higher +-// performance by reducing the number of temporary memory +-// buffers, reduces the number of scan calls by using vector +-// integer types, and allows extraction of per-vertex normals +-// optionally computes per-vertex colors if provided with a +-// volumetric texture map. ++// ++// The implementation is loosely based on the MC demo from ++// the Nvidia GPU Computing SDK, but the design has been ++// improved and extended in several ways. ++// ++// This implementation achieves higher performance ++// by reducing the number of temporary memory ++// buffers, reduces the number of scan calls by using ++// vector integer types, and allows extraction of ++// per-vertex normals and optionally computes ++// per-vertex colors if a volumetric texture map is provided. + // + // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de> + // John Stone <johns@ks.uiuc.edu> +@@ -48,7 +51,7 @@ + #include <thrust/functional.h> + + // +-// Restrict macro to make it easy to do perf tuning tess ++// Restrict macro to make it easy to do perf tuning tests + // + #if 0 + #define RESTRICT __restrict__ +@@ -171,6 +174,11 @@ + texture<float, 3, cudaReadModeElementType> volumeTex; + + // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS ++// XXX The sampleVolume() call underperforms vs. peak memory bandwidth ++// because we don't strictly enforce coalescing requirements in the ++// layout of the input volume presently. If we forced X/Y dims to be ++// warp-multiple it would become possible to use wider fetches and ++// a few other tricks to improve global memory bandwidth + __device__ float sampleVolume(const float * RESTRICT data, + uint3 p, uint3 gridSize) { + return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x]; +@@ -592,6 +600,30 @@ + cudaBindTextureToArray(volumeTex, d_vol, desc); + } + ++#if CUDART_VERSION >= 9000 ++// ++// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when ++// used with the built-in uint2 vector integer types. To workaround ++// the problem we have to define our own type and associated conversion ++// routines etc. 
++// ++ ++// XXX workaround for uint2 breakage in CUDA 9.0RC ++struct myuint2 : uint2 { ++ __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {} ++ __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {} ++ __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {} ++}; ++ ++void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) { ++ const uint2 zero = make_uint2(0, 0); ++ thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input), ++ thrust::device_ptr<myuint2>((myuint2*)input + numElements), ++ thrust::device_ptr<myuint2>((myuint2*)output), ++ (myuint2) zero); ++} ++ ++#else + + void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) { + const uint2 zero = make_uint2(0, 0); +@@ -601,6 +633,7 @@ + zero); + } + ++#endif + + void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) { + thrust::inclusive_scan(thrust::device_ptr<float>(input), +@@ -639,11 +672,9 @@ + } + + +-/////////////////////////////////////////////////////////////////////////////// + // + // class CUDAMarchingCubes + // +-/////////////////////////////////////////////////////////////////////////////// + + CUDAMarchingCubes::CUDAMarchingCubes() { + // initialize values +@@ -713,9 +744,6 @@ + } + + +-//////////////////////////////////////////////////////////////////////////////// +-//! Run the Cuda part of the computation +-//////////////////////////////////////////////////////////////////////////////// + void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) { + // check if data is available + if (!this->setdata) + +--- src/CUDAMDFF.cu 2016-12-01 10:11:56.000000000 +0300 ++++ src/CUDAMDFF.cu 2018-03-30 18:56:44.352937599 +0300 +@@ -11,7 +11,7 @@ + * + * $RCSfile: CUDAMDFF.cu,v $ + * $Author: johns $ $Locker: $ $State: Exp $ +- * $Revision: 1.75 $ $Date: 2015/04/07 20:41:26 $ ++ * $Revision: 1.78 $ $Date: 2018/02/19 07:10:37 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -28,12 +28,16 @@ + #include <stdlib.h> + #include <string.h> + #include <cuda.h> +-#include <float.h> // FLT_MAX etc +- ++#if CUDART_VERSION >= 9000 ++#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 ++#endif + #if CUDART_VERSION < 4000 + #error The VMD MDFF feature requires CUDA 4.0 or later + #endif + ++#include <float.h> // FLT_MAX etc ++ ++ + #include "Inform.h" + #include "utilities.h" + #include "WKFThreads.h" +@@ -588,6 +592,43 @@ + } + + ++ ++// #define VMDUSESHUFFLE 1 ++#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000 ++// New warp shuffle-based CC sum reduction for Kepler and later GPUs. ++inline __device__ void cc_sumreduction(int tid, int totaltb, ++ float4 &total_cc_sums, ++ float &total_lcc, ++ int &total_lsize, ++ float4 *tb_cc_sums, ++ float *tb_lcc, ++ int *tb_lsize) { ++ total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f); ++ total_lcc = 0.0f; ++ total_lsize = 0; ++ ++ // use precisely one warp to do the final reduction ++ if (tid < warpSize) { ++ for (int i=tid; i<totaltb; i+=warpSize) { ++ total_cc_sums += tb_cc_sums[i]; ++ total_lcc += tb_lcc[i]; ++ total_lsize += tb_lsize[i]; ++ } ++ ++ // perform intra-warp parallel reduction... 
++ // general loop version of parallel sum-reduction ++ for (int mask=warpSize/2; mask>0; mask>>=1) { ++ total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask); ++ total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask); ++ total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask); ++ total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask); ++ total_lcc += __shfl_xor_sync(0xffffffff, total_lcc, mask); ++ total_lsize += __shfl_xor_sync(0xffffffff, total_lsize, mask); ++ } ++ } ++} ++#else ++// shared memory based CC sum reduction + inline __device__ void cc_sumreduction(int tid, int totaltb, + float4 &total_cc_sums, + float &total_lcc, +@@ -629,6 +670,7 @@ + total_lcc = tb_lcc[0]; + total_lsize = tb_lsize[0]; + } ++#endif + + + inline __device__ void thread_cc_sum(float ref, float density, +@@ -750,6 +792,92 @@ + } + + ++#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000 ++ // all threads write their local sums to shared memory... ++ __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ]; ++ __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ]; ++ __shared__ float tb_lcc_s[TOTALBLOCKSZ]; ++ __shared__ int tb_lsize_s[TOTALBLOCKSZ]; ++ ++ tb_cc_means_s[tid] = thread_cc_means; ++ tb_cc_squares_s[tid] = thread_cc_squares; ++ tb_lcc_s[tid] = thread_lcc; ++ tb_lsize_s[tid] = thread_lsize; ++ __syncthreads(); // all threads must hit syncthreads call... ++ ++ // use precisely one warp to do the thread-block-wide reduction ++ if (tid < warpSize) { ++ float2 tmp_cc_means = make_float2(0.0f, 0.0f); ++ float2 tmp_cc_squares = make_float2(0.0f, 0.0f); ++ float tmp_lcc = 0.0f; ++ int tmp_lsize = 0; ++ for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) { ++ tmp_cc_means += tb_cc_means_s[i]; ++ tmp_cc_squares += tb_cc_squares_s[i]; ++ tmp_lcc += tb_lcc_s[i]; ++ tmp_lsize += tb_lsize_s[i]; ++ } ++ ++ // perform intra-warp parallel reduction... ++ // general loop version of parallel sum-reduction ++ for (int mask=warpSize/2; mask>0; mask>>=1) { ++ tmp_cc_means.x += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask); ++ tmp_cc_means.y += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask); ++ tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask); ++ tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask); ++ tmp_lcc += __shfl_xor_sync(0xffffffff, tmp_lcc, mask); ++ tmp_lsize += __shfl_xor_sync(0xffffffff, tmp_lsize, mask); ++ } ++ ++ // write per-thread-block partial sums to global memory, ++ // if a per-thread-block CC output array is provided, write the ++ // local CC for this thread block out, and finally, check if we ++ // are the last thread block to finish, and finalize the overall ++ // CC results for the entire grid of thread blocks. 
++ if (tid == 0) { ++ unsigned int bid = blockIdx.z * gridDim.x * gridDim.y + ++ blockIdx.y * gridDim.x + blockIdx.x; ++ ++ tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y, ++ tmp_cc_squares.x, tmp_cc_squares.y); ++ tb_lcc[bid] = tmp_lcc; ++ tb_lsize[bid] = tmp_lsize; ++ ++ if (tb_CC != NULL) { ++ float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y, ++ tb_cc_squares_s[0].x, tb_cc_squares_s[0].y, ++ tb_lsize_s[0], tb_lcc_s[0]); ++ ++ // write local per-thread-block CC to global memory ++ tb_CC[bid] = cc; ++ } ++ ++ __threadfence(); ++ ++ unsigned int value = atomicInc(&tbcatomic[0], totaltb); ++ isLastBlockDone = (value == (totaltb - 1)); ++ } ++ } ++ __syncthreads(); ++ ++ if (isLastBlockDone) { ++ float4 total_cc_sums; ++ float total_lcc; ++ int total_lsize; ++ cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize, ++ tb_cc_sums, tb_lcc, tb_lsize); ++ ++ if (tid == 0) { ++ tb_cc_sums[totaltb] = total_cc_sums; ++ tb_lcc[totaltb] = total_lcc; ++ tb_lsize[totaltb] = total_lsize; ++ } ++ ++ reset_atomic_counter(&tbcatomic[0]); ++ } ++ ++#else ++ + // all threads write their local sums to shared memory... + __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ]; + __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ]; +@@ -794,6 +922,7 @@ + } + __syncthreads(); // all threads must hit syncthreads call... + } ++//#endif + + // write per-thread-block partial sums to global memory, + // if a per-thread-block CC output array is provided, write the +@@ -847,6 +976,7 @@ + } + #endif + } ++#endif + } + + + +--- src/CUDAQuickSurf.cu 2016-12-01 10:11:56.000000000 +0300 ++++ src/CUDAQuickSurf.cu 2018-03-30 19:01:38.777196233 +0300 +@@ -11,7 +11,7 @@ + * + * $RCSfile: CUDAQuickSurf.cu,v $ + * $Author: johns $ $Locker: $ $State: Exp $ +- * $Revision: 1.81 $ $Date: 2016/04/20 04:57:46 $ ++ * $Revision: 1.84 $ $Date: 2018/02/15 04:59:15 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -22,6 +22,9 @@ + #include <stdlib.h> + #include <string.h> + #include <cuda.h> ++#if CUDART_VERSION >= 9000 ++#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 ++#endif + + #if CUDART_VERSION < 4000 + #error The VMD QuickSurf feature requires CUDA 4.0 or later +@@ -130,14 +133,14 @@ + #define GUNROLL 1 + #endif + +-#if __CUDA_ARCH__ >= 300 + #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) +-#define MINBLOCKDENS 1 ++#if __CUDA_ARCH__ >= 600 ++#define MINBLOCKDENS 16 ++#elif __CUDA_ARCH__ >= 300 ++#define MINBLOCKDENS 16 + #elif __CUDA_ARCH__ >= 200 +-#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) + #define MINBLOCKDENS 1 + #else +-#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) + #define MINBLOCKDENS 1 + #endif + +@@ -150,7 +153,7 @@ + // + template<class DENSITY, class VOLTEX> + __global__ static void +-// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) ++__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) + gaussdensity_fast_tex_norm(int natoms, + const float4 * RESTRICT sorted_xyzr, + const float4 * RESTRICT sorted_color, +@@ -217,6 +220,8 @@ + for (yab=yabmin; yab<=yabmax; yab++) { + for (xab=xabmin; xab<=xabmax; xab++) { + int abcellidx = zab * acplanesz + yab * acncells.x + xab; ++ // this biggest latency hotspot in the kernel, if we could improve ++ // packing of the grid cell map, we'd likely improve performance + uint2 atomstartend = cellStartEnd[abcellidx]; + if (atomstartend.x != GRID_CELL_EMPTY) { + unsigned int atomid; +@@ -296,7 +301,7 @@ + + + __global__ static void +-// __launch_bounds__ ( MAXTHRDENS, 
MINBLOCKDENS ) ++__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) + gaussdensity_fast_tex3f(int natoms, + const float4 * RESTRICT sorted_xyzr, + const float4 * RESTRICT sorted_color, +@@ -363,6 +368,8 @@ + for (yab=yabmin; yab<=yabmax; yab++) { + for (xab=xabmin; xab<=xabmax; xab++) { + int abcellidx = zab * acplanesz + yab * acncells.x + xab; ++ // this biggest latency hotspot in the kernel, if we could improve ++ // packing of the grid cell map, we'd likely improve performance + uint2 atomstartend = cellStartEnd[abcellidx]; + if (atomstartend.x != GRID_CELL_EMPTY) { + unsigned int atomid; +@@ -550,7 +557,6 @@ + + // per-GPU handle with various memory buffer pointers, etc. + typedef struct { +- /// max grid sizes and attributes the current allocations will support + int verbose; + long int natoms; + int colorperatom; +@@ -561,18 +567,18 @@ + int gy; + int gz; + +- CUDAMarchingCubes *mc; ///< Marching cubes class used to extract surface ++ CUDAMarchingCubes *mc; + +- float *devdensity; ///< density map stored in GPU memory +- void *devvoltexmap; ///< volumetric texture map +- float4 *xyzr_d; ///< atom coords and radii +- float4 *sorted_xyzr_d; ///< cell-sorted coords and radii +- float4 *color_d; ///< colors +- float4 *sorted_color_d; ///< cell-sorted colors +- +- unsigned int *atomIndex_d; ///< cell index for each atom +- unsigned int *atomHash_d; ///< +- uint2 *cellStartEnd_d; ///< cell start/end indices ++ float *devdensity; ++ void *devvoltexmap; ++ float4 *xyzr_d; ++ float4 *sorted_xyzr_d; ++ float4 *color_d; ++ float4 *sorted_color_d; ++ ++ unsigned int *atomIndex_d; ++ unsigned int *atomHash_d; ++ uint2 *cellStartEnd_d; + + void *safety; + float3 *v3f_d; diff --git a/mpi.patch b/mpi.patch new file mode 100644 index 000000000000..b3822a31f665 --- /dev/null +++ b/mpi.patch @@ -0,0 +1,26 @@ +--- configure 2019-05-24 15:48:39.590316691 +0300 ++++ configure 2019-05-24 15:48:07.136424874 +0300 +@@ -970,15 +970,15 @@ + ####################### + $mpi_defines = "-DVMDMPI "; + ## Argonne MPICH +-$mpi_dir = "/usr/lib64/mpich"; +-$mpi_include = "-I/usr/include/mpich-x86_64"; +-$mpi_library = "-L$mpi_dir/lib"; +-$mpi_libs = "-lmpich"; ++#$mpi_dir = "/usr"; ++#$mpi_include = "-I/usr/include"; ++#$mpi_library = "-L$mpi_dir/lib"; ++#$mpi_libs = "-lmpicxx -lmpi"; + ## OpenMPI +-# $mpi_dir = "/usr/lib64/openmpi"; +-# $mpi_include = "-I/usr/include/openmpi-x86_64"; +-# $mpi_library = "-L$mpi_dir/lib"; +-# $mpi_libs = "-lmpi"; ++$mpi_dir = "/usr"; ++$mpi_include = "-I/usr/include"; ++$mpi_library = "-L$mpi_dir/lib/openmpi"; ++$mpi_libs = "-lmpi_cxx -lmpi"; + ## OSU MVAPICH + # $mpi_dir = "/usr/mpi/gcc/mvapich-1.0.0"; + # $mpi_include = "-I$mpi_dir/include"; diff --git a/ospray.patch b/ospray.patch new file mode 100644 index 000000000000..f825563d8f46 --- /dev/null +++ b/ospray.patch @@ -0,0 +1,53 @@ +--- src/OSPRayRenderer.C 2016-12-01 10:11:51.000000000 +0300 ++++ src/OSPRayRenderer.C 2018-03-30 18:22:52.389958174 +0300 +@@ -11,7 +11,7 @@ + * + * $RCSfile: OSPRayRenderer.C + * $Author: johns $ $Locker: $ $State: Exp $ +-* $Revision: 1.60 $ $Date: 2016/11/28 06:00:48 $ ++* $Revision: 1.61 $ $Date: 2017/01/10 13:57:50 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -659,7 +659,7 @@ + if (ao_samples != 0) + lightscale = ao_direct; + +- for (int i = 0; i < directional_lights.num(); ++i) { ++ for (i = 0; i < directional_lights.num(); ++i) { + #if 1 + OSPLight light = ospNewLight(ospRenderer, "distant"); + #else +@@ -2021,7 +2021,7 @@ + ca.cylinders = 
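[Note, not part of the committed files: mpi.patch swaps the stock MPICH settings for OpenMPI installed under /usr (headers in /usr/include, library path /usr/lib/openmpi, linked with -lmpi_cxx -lmpi), which matches the "Assuming openmpi" comment in prepare(). A rough way to confirm that assumption before building (sketch):

    $ pacman -Qi openmpi >/dev/null && echo "openmpi installed"
    $ ldconfig -p | grep -E 'libmpi_cxx|libmpi\.so'

For MPICH instead, edit mpi.patch: re-comment the OpenMPI block and uncomment the MPICH one, adjusting the library names as needed.]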
diff --git a/ospray.patch b/ospray.patch
new file mode 100644
index 000000000000..f825563d8f46
--- /dev/null
+++ b/ospray.patch
@@ -0,0 +1,53 @@
+--- src/OSPRayRenderer.C	2016-12-01 10:11:51.000000000 +0300
++++ src/OSPRayRenderer.C	2018-03-30 18:22:52.389958174 +0300
+@@ -11,7 +11,7 @@
+ *
+ * $RCSfile: OSPRayRenderer.C
+ * $Author: johns $ $Locker:  $ $State: Exp $
+-* $Revision: 1.60 $ $Date: 2016/11/28 06:00:48 $
++* $Revision: 1.61 $ $Date: 2017/01/10 13:57:50 $
+ *
+ ***************************************************************************
+ * DESCRIPTION:
+@@ -659,7 +659,7 @@
+   if (ao_samples != 0)
+     lightscale = ao_direct;
+ 
+-  for (int i = 0; i < directional_lights.num(); ++i) {
++  for (i = 0; i < directional_lights.num(); ++i) {
+ #if 1
+     OSPLight light = ospNewLight(ospRenderer, "distant");
+ #else
+@@ -2021,7 +2021,7 @@
+   ca.cylinders = (float *) calloc(1, cylnum * bytes_per_cylinder);
+   ca.colors = (float *) calloc(1, cylnum * 4 * sizeof(float));
+ 
+-  unsigned int i,ind4,ind6,ind7;
++  int i,ind4,ind6,ind7;
+   const int rOffset = 6; // radius offset
+   if (wtrans == NULL) {
+     for (i=0,ind4=0,ind6=0,ind7=0; i<cylnum; i++,ind4+=4,ind6+=6,ind7+=7) {
+@@ -2469,10 +2469,7 @@
+   int i, ind, ind9, ind12;
+ 
+   const float ci2f = 1.0f / 255.0f;
+-  const float cn2f = 1.0f / 127.5f;
+   for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) {
+-    float norm[9];
+-
+     // transform to eye coordinates
+     wtrans.multpoint3d(v + ind9    , (float*) &mesh.v[ind9    ]);
+     wtrans.multpoint3d(v + ind9 + 3, (float*) &mesh.v[ind9 + 3]);
+@@ -2627,12 +2624,6 @@
+   // create and fill the OSPRay trimesh memory buffer
+   int i, ind, ind9, ind12;
+ 
+-  const rgba c = { uniform_color[0],
+-                   uniform_color[1],
+-                   uniform_color[2],
+-                   1.0f /* mat_opacity*/ };
+-
+-
+   for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) {
+     // transform to eye coordinates
+     wtrans.multpoint3d(v + ind9    , (float*) &mesh.v[ind9    ]);