Updated to the latest alpha (1.9.4a38)

author: Anton Kudelin 2020-01-23 15:28:45 +0300
committer: Anton Kudelin 2020-01-23 15:28:45 +0300
commit: b76e49a197f9ca85917562e7126903efef435a8c (patch)
tree: c7ece2e247f0b878645e599aa0ed862ac39f5125
parent: b5bc00e19b30d6afada54c62ed505af6851c1551 (diff)
download: aur-b76e49a197f9ca85917562e7126903efef435a8c.tar.gz
5 files changed, 47 insertions, 533 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 8651a9da13da..a52081318d41 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,7 +1,7 @@
 pkgbase = vmd-src
 	pkgdesc = Visual Molecular Dynamics
-	pkgver = 1.9.3
-	pkgrel = 2
+	pkgver = 1.9.4a38
+	pkgrel = 1
 	url = http://www.ks.uiuc.edu/Research/vmd/
 	arch = x86_64
 	license = custom
@@ -21,15 +21,11 @@ pkgbase = vmd-src
 	optdepends = cuda: NVIDIA CUDA GPU acceleration functions
 	conflicts = vmd
 	conflicts = vmd-bin
-	source = local://vmd-1.9.3.src.tar.gz
+	source = local://vmd-1.9.4a38.src.tar.gz
 	source = configure.patch
-	source = ospray.patch
-	source = cuda.patch
 	source = mpi.patch
-	sha256sums = 5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128
-	sha256sums = c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275
-	sha256sums = 09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d
-	sha256sums = 8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa
+	sha256sums = 5ae1448c59369f99d0c9408b0acbc08a55bea9fa69f9b25fccc565ddac671fa9
+	sha256sums = 96ce10c2d9728b416dbac83bfc30d84248044ea7371de7bbfbe854ba7e0adef5
 	sha256sums = e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0
 
 pkgname = vmd-src
diff --git a/PKGBUILD b/PKGBUILD
index b3c83546bca2..b8313040abd8 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -5,8 +5,8 @@
 
 pkgname=vmd-src
 _pkgname=vmd
-pkgver=1.9.3
-pkgrel=2
+pkgver=1.9.4a38
+pkgrel=1
 pkgdesc="Visual Molecular Dynamics"
 url="http://www.ks.uiuc.edu/Research/vmd/"
 license=('custom')
@@ -19,16 +19,13 @@ optdepends=('openbabel: additional file formats support'
             'optix: accelerated ray tracing for NVIDIA GPUs'
             'cuda: NVIDIA CUDA GPU acceleration functions')
 conflicts=("$_pkgname" "$_pkgname-bin")
-# You have to download the package from the VMD url and put it in the PKGBUILD folder.
+# The source tarball must be downloaded manually from the VMD URL above
+# and placed in the directory that contains this PKGBUILD.
 source=("local://$_pkgname-${pkgver}.src.tar.gz"
         "configure.patch"
-        "ospray.patch"
-        "cuda.patch"
         "mpi.patch")
-sha256sums=('5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128'
-            'c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275'
-            '09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d'
-            '8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa'
+sha256sums=('5ae1448c59369f99d0c9408b0acbc08a55bea9fa69f9b25fccc565ddac671fa9'
+            '96ce10c2d9728b416dbac83bfc30d84248044ea7371de7bbfbe854ba7e0adef5'
             'e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0')
 
 prepare() {
@@ -37,8 +34,6 @@ prepare() {
   mkdir plugins
   sed -i 's#:${LD_LIBRARY_PATH}/:${LD_LIBRARY_PATH}:#/opt/optix/lib64#g' bin/*
   patch -p0 < ../configure.patch
-  patch -p0 < ../ospray.patch
-  patch -p0 < ../cuda.patch
   
   # Assuming openmpi; if not the case edit mpi.patch
   patch -p0 < ../mpi.patch
@@ -60,7 +55,7 @@ build() {
   make -j1 LINUXAMD64 world
   make distrib
   cd ../$_pkgname-$pkgver
-  ./configure LINUXAMD64 OPENGL EGLPBUFFER FLTKOPENGL FLTK TK $ACC IMD OPENCL MPI XINERAMA XINPUT LIBOSPRAY LIBPNG NETCDF COLVARS TCL PYTHON PTHREADS GCC
+  ./configure LINUXAMD64 OPENGL EGLPBUFFER FLTKOPENGL FLTK TK $ACC IMD OPENCL MPI XINERAMA XINPUT LIBOSPRAY LIBPNG ZLIB NETCDF COLVARS TCL PYTHON PTHREADS GCC
   cd src
   make veryclean
   make
diff --git a/configure.patch b/configure.patch
index ad6dd0acb5e3..62d58821acf0 100644
--- a/configure.patch
+++ b/configure.patch
@@ -1,15 +1,15 @@
---- configure	2016-12-01 10:11:33.000000000 +0300
-+++ configure	2019-05-24 15:40:14.742991210 +0300
-@@ -466,7 +466,7 @@
+--- configure	2020-01-23 14:40:20.000000000 +0300
++++ configure	2020-01-23 14:56:53.679636518 +0300
+@@ -483,7 +483,7 @@
  
  $arch_cc          = "cc";
  $arch_ccpp        = "CC";
--$arch_nvcc        = "/usr/local/cuda-8.0/bin/nvcc";
+-$arch_nvcc        = "/usr/local/cuda-10.0/bin/nvcc";
 +$arch_nvcc        = "nvcc";
- $arch_nvccflags   = "--ptxas-options=-v " . 
+ $arch_nvccflags   = "-lineinfo --ptxas-options=-v " . 
                      "-gencode arch=compute_30,code=compute_30 " .
                      "-gencode arch=compute_30,code=sm_35 " .
-@@ -729,8 +729,8 @@
+@@ -767,8 +767,8 @@
  if ($config_tk) { $tcl_include .= " -I$stock_tk_include_dir"; }
  $tcl_library      = "-L$stock_tcl_library_dir";
  if ($config_tk) { $tcl_library .= " -L$stock_tk_library_dir"; }
@@ -20,39 +20,41 @@
  
  @tcl_cc           = ();
  @tcl_cu           = ();
-@@ -888,11 +888,7 @@
+@@ -895,13 +895,7 @@
  $system_dir           = "";
  $system_include       = "-I.";
  $system_library       = "";
 -if ( $config_gcc ) {
--  $system_libs        = "-ll -lm";
+-  ## Note: some old commercial versions of Unix require -ll for lex internals
+-  ## now that we've been using flex for years, -ll should be moot.
+-  $system_libs        = "-lm";
 -} else {
 -  $system_libs        = "-lm";
 -}
-+$system_libs        = "-lrt -lm";
++$system_libs          = "-lrt -lm";
  @system_cc             = ();
  @system_cu             = ();
  @system_ccpp           = ();
-@@ -922,7 +918,7 @@
+@@ -931,7 +925,7 @@
  #   This option enables the use of CUDA GPU acceleration functions.
  #######################
  $cuda_defines     = "-DVMDCUDA -DMSMPOT_CUDA";
--$cuda_dir         = "/usr/local/cuda-8.0";
+-$cuda_dir         = "/usr/local/cuda-10.0";
 +$cuda_dir         = "/opt/cuda";
  $cuda_include     = "";
  $cuda_library     = "";
- $cuda_libs        = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart_static";
-@@ -1042,7 +1038,7 @@
- # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-3.6.3-linux64";
- # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-3.8.0-linux64";
- # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-3.9.0-linux64";
--$liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-4.0.1-linux64";
+ $cuda_libs        = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart_static -lrt";
+@@ -1098,7 +1092,7 @@
+ # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-4.1.1-linux64";
+ # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-5.0.0-linux64";
+ # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-5.0.1-linux64";
+-$liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-5.1.0-linux64";
 +$liboptix_dir         = "/opt/optix";
+ # $liboptix_dir         = "/usr/local/encap/NVIDIA-OptiX-SDK-6.0.0-linux64";
+ 
  # NCSA Blue Waters
- # $liboptix_dir         = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.6.3-linux64";
- # $liboptix_dir         = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.8.0-linux64";
-@@ -1082,7 +1078,7 @@
- $libospray_dir         = "/usr/local/ospray-1.1.1.x86_64.linux";
+@@ -1148,7 +1142,7 @@
+ $libospray_dir         = "/usr/local/ospray-1.7.0.x86_64.linux";
  $libospray_include     = "-I$libospray_dir/include -I$libospray_dir/ospray/include ";
  $libospray_library     = "-L$libospray_dir/lib ";
 -$libospray_libs        = "-lospray -lospray_common -lembree -ltbb -ltbbmalloc ";
@@ -60,7 +62,7 @@
  # Both OptiX and OSPRay renderers use the Tachyon glwin code, 
  # so we have to make sure we don't build/link it twice
  if ($config_liboptix) {
-@@ -1375,20 +1371,20 @@
+@@ -1459,20 +1453,20 @@
  ###################
  # location of Python library and include file.
  # If left blank, standard system  directories will be searched.
@@ -69,15 +71,15 @@
  #$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "/usr/local/lib";
 -$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5";
 -$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config";
-+#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.7";
-+#$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/config";
++#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5";
++#$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config";
  
  #$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "/usr/local/include";
  #$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "/usr/local/lib";
 -$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include";
 -$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include";
 +#$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include";
-+#$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/site-packages/numpy/core/include";
++#$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include";
  
  $python_defines     = "-DVMDPYTHON";
  $python_include     = "-I$stock_python_include_dir -I$stock_numpy_include_dir -I$stock_numpy_library_dir";
@@ -87,23 +89,23 @@
  @python_h           = ('PythonTextInterp.h',
                         'VMDTkinterMenu.h',
  		       'py_commands.h',
-@@ -2278,9 +2274,9 @@
+@@ -2368,9 +2362,9 @@
        # for compiling with g++:
        $arch_cc		= "gcc";
        $arch_ccpp  	= "g++";
--      $arch_opt_flag	= "-m64 -fno-for-scope -Wno-deprecated -Wall -O6 -ffast-math";
-+      $arch_opt_flag	= "-m64 -fno-for-scope -Wno-deprecated -Wall -O3 -ffast-math";
+-      $arch_opt_flag	= "-m64 -fno-for-scope -Wno-deprecated -Wall -Wno-unknown-pragmas -O6 -ffast-math";
++      $arch_opt_flag	= "-Ofast -march=native";
        $arch_depend_flag   = "-MM";
--      $arch_copts	        = "-m64 -Wall -O6 -ffast-math";
-+      $arch_copts	        = "-m64 -Wall -O3 -ffast-math";
-       $arch_template_repository = "foobar";
+-      $arch_copts       = "-m64 -Wall -Wno-unknown-pragmas -O6 -ffast-math";
++      $arch_copts       = "-Ofast -march=native";
  
        # so far only STATIC version tested
-@@ -2293,7 +2289,7 @@
+       if ($config_static) {
+@@ -2382,7 +2376,7 @@
  
      if ($config_cuda) {
        $arch_nvccflags   .= " --machine 64 -O3 $cuda_include";
--      $cuda_library     = "-L/usr/local/cuda-8.0/lib64";
+-      $cuda_library     = "-L/usr/local/cuda-10.0/lib64";
 +      $cuda_library     = "-L/opt/cuda/lib64";
      }
  
diff --git a/cuda.patch b/cuda.patch
deleted file mode 100644
index 770360ec8d94..000000000000
--- a/cuda.patch
+++ /dev/null
@@ -1,426 +0,0 @@
---- src/CUDAMarchingCubes.cu	2018-03-30 18:52:25.467189457 +0300
-+++ src/CUDAMarchingCubes.cu	2018-03-30 18:52:02.387136244 +0300
-@@ -10,7 +10,7 @@
-  *
-  *      $RCSfile: CUDAMarchingCubes.cu,v $
-  *      $Author: johns $        $Locker:  $             $State: Exp $
-- *      $Revision: 1.30 $       $Date: 2016/11/28 03:04:58 $
-+ *      $Revision: 1.32 $       $Date: 2018/02/15 05:15:02 $
-  *
-  ***************************************************************************
-  * DESCRIPTION:
-@@ -25,14 +25,17 @@
- //
- // Description: This class computes an isosurface for a given density grid
- //              using a CUDA Marching Cubes (MC) alorithm. 
--//              The implementation is based on the MC demo from the 
--//              Nvidia GPU Computing SDK, but has been improved 
--//              and extended.  This implementation achieves higher 
--//              performance by reducing the number of temporary memory
--//              buffers, reduces the number of scan calls by using vector
--//              integer types, and allows extraction of per-vertex normals 
--//              optionally computes per-vertex colors if provided with a 
--//              volumetric texture map.
-+//
-+//              The implementation is loosely based on the MC demo from 
-+//              the Nvidia GPU Computing SDK, but the design has been 
-+//              improved and extended in several ways.  
-+//
-+//              This implementation achieves higher performance
-+//              by reducing the number of temporary memory
-+//              buffers, reduces the number of scan calls by using 
-+//              vector integer types, and allows extraction of 
-+//              per-vertex normals and optionally computes 
-+//              per-vertex colors if a volumetric texture map is provided.
- //
- // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de>
- //         John Stone <johns@ks.uiuc.edu>
-@@ -48,7 +51,7 @@
- #include <thrust/functional.h>
- 
- //
--// Restrict macro to make it easy to do perf tuning tess
-+// Restrict macro to make it easy to do perf tuning tests
- //
- #if 0
- #define RESTRICT __restrict__
-@@ -171,6 +174,11 @@
- texture<float, 3, cudaReadModeElementType> volumeTex;
- 
- // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS
-+// XXX The sampleVolume() call underperforms vs. peak memory bandwidth
-+//     because we don't strictly enforce coalescing requirements in the
-+//     layout of the input volume presently.  If we forced X/Y dims to be
-+//     warp-multiple it would become possible to use wider fetches and
-+//     a few other tricks to improve global memory bandwidth 
- __device__ float sampleVolume(const float * RESTRICT data, 
-                               uint3 p, uint3 gridSize) {
-     return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
-@@ -592,6 +600,30 @@
-     cudaBindTextureToArray(volumeTex, d_vol, desc);
- }
- 
-+#if CUDART_VERSION >= 9000
-+//
-+// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when
-+//     used with the built-in uint2 vector integer types.  To workaround
-+//     the problem we have to define our own type and associated conversion
-+//     routines etc.
-+//
-+
-+// XXX workaround for uint2 breakage in CUDA 9.0RC
-+struct myuint2 : uint2 {
-+  __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {}
-+  __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {}
-+  __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {}
-+};
-+
-+void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
-+    const uint2 zero = make_uint2(0, 0);
-+    thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input),
-+                           thrust::device_ptr<myuint2>((myuint2*)input + numElements),
-+                           thrust::device_ptr<myuint2>((myuint2*)output),
-+                           (myuint2) zero);
-+}
-+
-+#else
- 
- void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
-     const uint2 zero = make_uint2(0, 0);
-@@ -601,6 +633,7 @@
-                            zero);
- }
- 
-+#endif
- 
- void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) {
-     thrust::inclusive_scan(thrust::device_ptr<float>(input), 
-@@ -639,11 +672,9 @@
- }
- 
- 
--///////////////////////////////////////////////////////////////////////////////
- //
- // class CUDAMarchingCubes
- //
--///////////////////////////////////////////////////////////////////////////////
- 
- CUDAMarchingCubes::CUDAMarchingCubes() {
-     // initialize values
-@@ -713,9 +744,6 @@
- }
- 
- 
--////////////////////////////////////////////////////////////////////////////////
--//! Run the Cuda part of the computation
--////////////////////////////////////////////////////////////////////////////////
- void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) {
-     // check if data is available
-     if (!this->setdata)
-
---- src/CUDAMDFF.cu	2016-12-01 10:11:56.000000000 +0300
-+++ src/CUDAMDFF.cu	2018-03-30 18:56:44.352937599 +0300
-@@ -11,7 +11,7 @@
-  *
-  *      $RCSfile: CUDAMDFF.cu,v $
-  *      $Author: johns $        $Locker:  $             $State: Exp $
-- *      $Revision: 1.75 $      $Date: 2015/04/07 20:41:26 $
-+ *      $Revision: 1.78 $      $Date: 2018/02/19 07:10:37 $
-  *
-  ***************************************************************************
-  * DESCRIPTION:
-@@ -28,12 +28,16 @@
- #include <stdlib.h>
- #include <string.h>
- #include <cuda.h>
--#include <float.h> // FLT_MAX etc
--
-+#if CUDART_VERSION >= 9000
-+#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0
-+#endif
- #if CUDART_VERSION < 4000
- #error The VMD MDFF feature requires CUDA 4.0 or later
- #endif
- 
-+#include <float.h> // FLT_MAX etc
-+
-+
- #include "Inform.h"
- #include "utilities.h"
- #include "WKFThreads.h"
-@@ -588,6 +592,43 @@
- }
- 
- 
-+
-+// #define VMDUSESHUFFLE 1
-+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
-+// New warp shuffle-based CC sum reduction for Kepler and later GPUs.
-+inline __device__ void cc_sumreduction(int tid, int totaltb, 
-+                                float4 &total_cc_sums,
-+                                float &total_lcc,
-+                                int &total_lsize,
-+                                float4 *tb_cc_sums,
-+                                float *tb_lcc,
-+                                int *tb_lsize) {
-+  total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-+  total_lcc = 0.0f;
-+  total_lsize = 0;
-+
-+  // use precisely one warp to do the final reduction
-+  if (tid < warpSize) {
-+    for (int i=tid; i<totaltb; i+=warpSize) {
-+      total_cc_sums += tb_cc_sums[i];
-+      total_lcc     += tb_lcc[i];
-+      total_lsize   += tb_lsize[i];
-+    }
-+
-+    // perform intra-warp parallel reduction...
-+    // general loop version of parallel sum-reduction
-+    for (int mask=warpSize/2; mask>0; mask>>=1) {
-+      total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask);
-+      total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask);
-+      total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask);
-+      total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask);
-+      total_lcc     += __shfl_xor_sync(0xffffffff, total_lcc, mask);
-+      total_lsize   += __shfl_xor_sync(0xffffffff, total_lsize, mask);
-+    }
-+  }
-+}
-+#else
-+// shared memory based CC sum reduction 
- inline __device__ void cc_sumreduction(int tid, int totaltb, 
-                                 float4 &total_cc_sums,
-                                 float &total_lcc,
-@@ -629,6 +670,7 @@
-   total_lcc = tb_lcc[0];
-   total_lsize = tb_lsize[0];
- }
-+#endif
- 
- 
- inline __device__ void thread_cc_sum(float ref, float density,
-@@ -750,6 +792,92 @@
-   }
- 
- 
-+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
-+  // all threads write their local sums to shared memory...
-+  __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
-+  __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
-+  __shared__ float tb_lcc_s[TOTALBLOCKSZ];
-+  __shared__ int tb_lsize_s[TOTALBLOCKSZ];
-+
-+  tb_cc_means_s[tid] = thread_cc_means;
-+  tb_cc_squares_s[tid] = thread_cc_squares;
-+  tb_lcc_s[tid] = thread_lcc;
-+  tb_lsize_s[tid] = thread_lsize;
-+  __syncthreads(); // all threads must hit syncthreads call...
-+
-+  // use precisely one warp to do the thread-block-wide reduction
-+  if (tid < warpSize) {
-+    float2 tmp_cc_means = make_float2(0.0f, 0.0f);
-+    float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
-+    float tmp_lcc = 0.0f;
-+    int tmp_lsize = 0;
-+    for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
-+      tmp_cc_means   += tb_cc_means_s[i];
-+      tmp_cc_squares += tb_cc_squares_s[i];
-+      tmp_lcc        += tb_lcc_s[i];
-+      tmp_lsize      += tb_lsize_s[i];
-+    }
-+
-+    // perform intra-warp parallel reduction...
-+    // general loop version of parallel sum-reduction
-+    for (int mask=warpSize/2; mask>0; mask>>=1) {
-+      tmp_cc_means.x   += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask);
-+      tmp_cc_means.y   += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask);
-+      tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask);
-+      tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask);
-+      tmp_lcc          += __shfl_xor_sync(0xffffffff, tmp_lcc, mask);
-+      tmp_lsize        += __shfl_xor_sync(0xffffffff, tmp_lsize, mask);
-+    }
-+
-+    // write per-thread-block partial sums to global memory,
-+    // if a per-thread-block CC output array is provided, write the 
-+    // local CC for this thread block out, and finally, check if we 
-+    // are the last thread block to finish, and finalize the overall
-+    // CC results for the entire grid of thread blocks.
-+    if (tid == 0) {   
-+      unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
-+                         blockIdx.y * gridDim.x + blockIdx.x;
-+
-+      tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y,
-+                                    tmp_cc_squares.x, tmp_cc_squares.y);
-+      tb_lcc[bid]     = tmp_lcc;
-+      tb_lsize[bid]   = tmp_lsize;
-+
-+      if (tb_CC != NULL) {
-+        float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
-+                           tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
-+                           tb_lsize_s[0], tb_lcc_s[0]);
-+
-+        // write local per-thread-block CC to global memory
-+        tb_CC[bid]   = cc;
-+      }
-+
-+      __threadfence();
-+
-+      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
-+      isLastBlockDone = (value == (totaltb - 1));
-+    }
-+  }
-+  __syncthreads();
-+
-+  if (isLastBlockDone) {
-+    float4 total_cc_sums;
-+    float total_lcc;
-+    int total_lsize;
-+    cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
-+                    tb_cc_sums, tb_lcc, tb_lsize); 
-+
-+    if (tid == 0) {
-+      tb_cc_sums[totaltb] = total_cc_sums;
-+      tb_lcc[totaltb] = total_lcc;
-+      tb_lsize[totaltb] = total_lsize;
-+    }
-+ 
-+    reset_atomic_counter(&tbcatomic[0]);
-+  }
-+
-+#else
-+
-   // all threads write their local sums to shared memory...
-   __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
-   __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
-@@ -794,6 +922,7 @@
-     }
-     __syncthreads(); // all threads must hit syncthreads call...
-   }
-+//#endif
- 
-   // write per-thread-block partial sums to global memory,
-   // if a per-thread-block CC output array is provided, write the 
-@@ -847,6 +976,7 @@
-     }
- #endif
-   }
-+#endif
- }
- 
- 
-
---- src/CUDAQuickSurf.cu	2016-12-01 10:11:56.000000000 +0300
-+++ src/CUDAQuickSurf.cu	2018-03-30 19:01:38.777196233 +0300
-@@ -11,7 +11,7 @@
-  *
-  *      $RCSfile: CUDAQuickSurf.cu,v $
-  *      $Author: johns $        $Locker:  $             $State: Exp $
-- *      $Revision: 1.81 $      $Date: 2016/04/20 04:57:46 $
-+ *      $Revision: 1.84 $      $Date: 2018/02/15 04:59:15 $
-  *
-  ***************************************************************************
-  * DESCRIPTION:
-@@ -22,6 +22,9 @@
- #include <stdlib.h>
- #include <string.h>
- #include <cuda.h>
-+#if CUDART_VERSION >= 9000
-+#include <cuda_fp16.h>  // need to explicitly include for CUDA 9.0
-+#endif
- 
- #if CUDART_VERSION < 4000
- #error The VMD QuickSurf feature requires CUDA 4.0 or later
-@@ -130,14 +133,14 @@
- #define GUNROLL      1
- #endif
- 
--#if __CUDA_ARCH__ >= 300
- #define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
--#define MINBLOCKDENS 1
-+#if __CUDA_ARCH__ >= 600
-+#define MINBLOCKDENS 16
-+#elif __CUDA_ARCH__ >= 300
-+#define MINBLOCKDENS 16
- #elif __CUDA_ARCH__ >= 200
--#define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
- #define MINBLOCKDENS 1
- #else
--#define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
- #define MINBLOCKDENS 1
- #endif
- 
-@@ -150,7 +153,7 @@
- //
- template<class DENSITY, class VOLTEX>
- __global__ static void 
--// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
-+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
- gaussdensity_fast_tex_norm(int natoms,
-                       const float4 * RESTRICT sorted_xyzr, 
-                       const float4 * RESTRICT sorted_color, 
-@@ -217,6 +220,8 @@
-     for (yab=yabmin; yab<=yabmax; yab++) {
-       for (xab=xabmin; xab<=xabmax; xab++) {
-         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
-+        // this biggest latency hotspot in the kernel, if we could improve
-+        // packing of the grid cell map, we'd likely improve performance 
-         uint2 atomstartend = cellStartEnd[abcellidx];
-         if (atomstartend.x != GRID_CELL_EMPTY) {
-           unsigned int atomid;
-@@ -296,7 +301,7 @@
- 
- 
- __global__ static void 
--// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
-+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
- gaussdensity_fast_tex3f(int natoms,
-                         const float4 * RESTRICT sorted_xyzr, 
-                         const float4 * RESTRICT sorted_color, 
-@@ -363,6 +368,8 @@
-     for (yab=yabmin; yab<=yabmax; yab++) {
-       for (xab=xabmin; xab<=xabmax; xab++) {
-         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
-+        // this biggest latency hotspot in the kernel, if we could improve
-+        // packing of the grid cell map, we'd likely improve performance 
-         uint2 atomstartend = cellStartEnd[abcellidx];
-         if (atomstartend.x != GRID_CELL_EMPTY) {
-           unsigned int atomid;
-@@ -550,7 +557,6 @@
- 
- // per-GPU handle with various memory buffer pointers, etc.
- typedef struct {
--  /// max grid sizes and attributes the current allocations will support
-   int verbose;
-   long int natoms;
-   int colorperatom;
-@@ -561,18 +567,18 @@
-   int gy;
-   int gz;
- 
--  CUDAMarchingCubes *mc;     ///< Marching cubes class used to extract surface
-+  CUDAMarchingCubes *mc;     
- 
--  float *devdensity;         ///< density map stored in GPU memory
--  void *devvoltexmap;        ///< volumetric texture map
--  float4 *xyzr_d;            ///< atom coords and radii
--  float4 *sorted_xyzr_d;     ///< cell-sorted coords and radii
--  float4 *color_d;           ///< colors
--  float4 *sorted_color_d;    ///< cell-sorted colors
--
--  unsigned int *atomIndex_d; ///< cell index for each atom
--  unsigned int *atomHash_d;  ///<  
--  uint2 *cellStartEnd_d;     ///< cell start/end indices 
-+  float *devdensity;         
-+  void *devvoltexmap;        
-+  float4 *xyzr_d;            
-+  float4 *sorted_xyzr_d;     
-+  float4 *color_d;           
-+  float4 *sorted_color_d;    
-+
-+  unsigned int *atomIndex_d; 
-+  unsigned int *atomHash_d;  
-+  uint2 *cellStartEnd_d;     
- 
-   void *safety;
-   float3 *v3f_d;
diff --git a/ospray.patch b/ospray.patch
deleted file mode 100644
index f825563d8f46..000000000000
--- a/ospray.patch
+++ /dev/null
@@ -1,53 +0,0 @@
---- src/OSPRayRenderer.C	2016-12-01 10:11:51.000000000 +0300
-+++ src/OSPRayRenderer.C	2018-03-30 18:22:52.389958174 +0300
-@@ -11,7 +11,7 @@
- *
- *      $RCSfile: OSPRayRenderer.C
- *      $Author: johns $      $Locker:  $               $State: Exp $
--*      $Revision: 1.60 $         $Date: 2016/11/28 06:00:48 $
-+*      $Revision: 1.61 $         $Date: 2017/01/10 13:57:50 $
- *
- ***************************************************************************
- * DESCRIPTION:
-@@ -659,7 +659,7 @@
-   if (ao_samples != 0)
-     lightscale = ao_direct;
- 
--  for (int i = 0; i < directional_lights.num(); ++i) {
-+  for (i = 0; i < directional_lights.num(); ++i) {
- #if 1
-     OSPLight light = ospNewLight(ospRenderer, "distant");
- #else
-@@ -2021,7 +2021,7 @@
-   ca.cylinders = (float *) calloc(1, cylnum * bytes_per_cylinder);
-   ca.colors = (float *) calloc(1, cylnum * 4 * sizeof(float));
- 
--  unsigned int i,ind4,ind6,ind7;
-+  int i,ind4,ind6,ind7;
-   const int rOffset = 6; // radius offset
-   if (wtrans == NULL) {
-     for (i=0,ind4=0,ind6=0,ind7=0; i<cylnum; i++,ind4+=4,ind6+=6,ind7+=7) {
-@@ -2469,10 +2469,7 @@
-   int i, ind, ind9, ind12;
- 
-   const float ci2f = 1.0f / 255.0f;
--  const float cn2f = 1.0f / 127.5f;
-   for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) {
--    float norm[9];
--
-     // transform to eye coordinates
-     wtrans.multpoint3d(v + ind9    , (float*) &mesh.v[ind9    ]);
-     wtrans.multpoint3d(v + ind9 + 3, (float*) &mesh.v[ind9 + 3]);
-@@ -2627,12 +2624,6 @@
-   // create and fill the OSPRay trimesh memory buffer
-   int i, ind, ind9, ind12;
- 
--  const rgba c = { uniform_color[0], 
--                   uniform_color[1], 
--                   uniform_color[2], 
--                   1.0f /* mat_opacity*/ };
--
--
-   for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) {
-     // transform to eye coordinates
-     wtrans.multpoint3d(v + ind9    , (float*) &mesh.v[ind9    ]);
author	Anton Kudelin	2020-01-23 15:28:45 +0300
committer	Anton Kudelin	2020-01-23 15:28:45 +0300
commit	b76e49a197f9ca85917562e7126903efef435a8c (patch)
tree	c7ece2e247f0b878645e599aa0ed862ac39f5125
parent	b5bc00e19b30d6afada54c62ed505af6851c1551 (diff)
download	aur-b76e49a197f9ca85917562e7126903efef435a8c.tar.gz