-rw-r--r--  .SRCINFO         |  34
-rw-r--r--  PKGBUILD         |  73
-rw-r--r--  configure.patch  | 110
-rw-r--r--  cuda.patch       | 426
-rw-r--r--  mpi.patch        |  26
-rw-r--r--  ospray.patch     |  53
6 files changed, 722 insertions, 0 deletions
diff --git a/.SRCINFO b/.SRCINFO
new file mode 100644
index 000000000000..8cef04b31e2a
--- /dev/null
+++ b/.SRCINFO
@@ -0,0 +1,34 @@
+pkgbase = vmd-src
+	pkgdesc = Visual Molecular Dynamics
+	pkgver = 1.9.3
+	pkgrel = 1
+	url = http://www.ks.uiuc.edu/Research/vmd/
+	arch = x86_64
+	license = custom
+	makedepends = gcc
+	depends = tcsh
+	depends = tk
+	depends = python2-numpy
+	depends = fltk
+	depends = ospray
+	depends = netcdf
+	depends = ocl-icd
+	optdepends = openbabel: additional file formats support
+	optdepends = sqlite: dmsplugin
+	optdepends = optix: accelerated ray tracing for NVIDIA GPUs
+	optdepends = cuda: NVIDIA CUDA GPU acceleration functions
+	conflicts = vmd
+	conflicts = vmd-bin
+	source = local://vmd-1.9.3.src.tar.gz
+	source = configure.patch
+	source = ospray.patch
+	source = cuda.patch
+	source = mpi.patch
+	sha256sums = 5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128
+	sha256sums = c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275
+	sha256sums = 09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d
+	sha256sums = 8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa
+	sha256sums = e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0
+
+pkgname = vmd-src
+
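[Note, not part of the committed files: the .SRCINFO above mirrors the metadata declared in the PKGBUILD that follows. If the PKGBUILD is edited later, this file is normally regenerated rather than edited by hand, e.g.:

    $ makepkg --printsrcinfo > .SRCINFO
]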
diff --git a/PKGBUILD b/PKGBUILD
new file mode 100644
index 000000000000..fc9c5493cfe0
--- /dev/null
+++ b/PKGBUILD
@@ -0,0 +1,73 @@
+# Maintainer: Anton Kudelin <kudelin at protonmail dot com>
+# Contributor: Eric Berquist <eric DOT berquist AT gmail>
+# Contributor: steabert <steabert@member.fsf.org>
+# Contributor: Ricardo Honorato Z.
+
+pkgname=vmd-src
+_pkgname=vmd
+pkgver=1.9.3
+pkgrel=1
+pkgdesc="Visual Molecular Dynamics"
+url="http://www.ks.uiuc.edu/Research/vmd/"
+license=('custom')
+arch=('x86_64')
+depends=('tcsh' 'tk' 'python2-numpy' 'fltk' 'ospray' 'netcdf' 'ocl-icd')
+makedepends=('gcc')
+optdepends=('openbabel: additional file formats support'
+            'sqlite: dmsplugin'
+            'optix: accelerated ray tracing for NVIDIA GPUs'
+            'cuda: NVIDIA CUDA GPU acceleration functions')
+conflicts=("$_pkgname" "$_pkgname-bin")
+# You have to download the package from the VMD url and put it in the PKGBUILD folder.
+source=("local://$_pkgname-${pkgver}.src.tar.gz"
+        "configure.patch"
+        "ospray.patch"
+        "cuda.patch"
+        "mpi.patch")
+sha256sums=('5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128'
+            'c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275'
+            '09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d'
+            '8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa'
+            'e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0')
+
+prepare() {
+  sed -i 's/ltcl8.5/ltcl/g' plugins/Make-arch
+  cd $_pkgname-$pkgver
+  mkdir plugins
+  sed -i 's#:${LD_LIBRARY_PATH}/:${LD_LIBRARY_PATH}:#/opt/optix/lib64#g' bin/*
+  patch -p0 < ../configure.patch
+  patch -p0 < ../ospray.patch
+  patch -p0 < ../cuda.patch
+
+  # Assuming openmpi; if not the case edit mpi.patch
+  patch -p0 < ../mpi.patch
+  export TCLINC="-I/usr/include"
+  export TCLLIB="-L/usr/lib"
+  export PLUGINDIR=$srcdir/$_pkgname-$pkgver/plugins
+  export VMDINSTALLBINDIR=$pkgdir/usr/bin
+  export VMDINSTALLLIBRARYDIR=$pkgdir/usr/lib/vmd
+
+  # Enable CUDA if nvcc is in PATH
+  if [ $( echo -n $( which nvcc) | tail -c 4 ) == nvcc ]
+  then
+    export ACC=CUDA
+  fi
+}
+
+build() {
+  cd $srcdir/plugins
+  make -j1 LINUXAMD64 world
+  make distrib
+  cd ../$_pkgname-$pkgver
+  ./configure LINUXAMD64 OPENGL EGLPBUFFER FLTKOPENGL FLTK TK $ACC IMD OPENCL MPI XINERAMA XINPUT LIBOSPRAY LIBPNG NETCDF COLVARS TCL PYTHON PTHREADS GCC
+  cd src
+  make veryclean
+  make
+}
+
+package() {
+  cd $srcdir/$_pkgname-$pkgver
+  install -D -m644 LICENSE $pkgdir/usr/share/licenses/$pkgname/LICENSE
+  cd src; make install
+  sed -i 's#set defaultvmddir=.*#set defaultvmddir=/usr/lib/vmd#' $pkgdir/usr/bin/vmd
+}
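[Note, not part of the committed files: because the tarball is listed with a local:// URL, makepkg will not download it; as the comment in the PKGBUILD says, it must be fetched manually from the VMD site (which requires registering) and placed next to the PKGBUILD. A minimal sketch of the expected workflow, assuming the standard AUR clone URL and that the tarball was saved to ~/Downloads (adjust paths as needed):

    $ git clone https://aur.archlinux.org/vmd-src.git
    $ cd vmd-src
    $ cp ~/Downloads/vmd-1.9.3.src.tar.gz .   # name must match the source= entry
    $ makepkg -si

The nvcc test in prepare() only switches on CUDA when nvcc is already in PATH; an equivalent, simpler check would be: command -v nvcc >/dev/null && export ACC=CUDA
]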
diff --git a/configure.patch b/configure.patch
new file mode 100644
index 000000000000..ad6dd0acb5e3
--- /dev/null
+++ b/configure.patch
@@ -0,0 +1,110 @@
+--- configure	2016-12-01 10:11:33.000000000 +0300
++++ configure	2019-05-24 15:40:14.742991210 +0300
+@@ -466,7 +466,7 @@
+ 
+ $arch_cc = "cc";
+ $arch_ccpp = "CC";
+-$arch_nvcc = "/usr/local/cuda-8.0/bin/nvcc";
++$arch_nvcc = "nvcc";
+ $arch_nvccflags = "--ptxas-options=-v " .
+                   "-gencode arch=compute_30,code=compute_30 " .
+                   "-gencode arch=compute_30,code=sm_35 " .
+@@ -729,8 +729,8 @@
+ if ($config_tk) { $tcl_include .= " -I$stock_tk_include_dir"; }
+ $tcl_library = "-L$stock_tcl_library_dir";
+ if ($config_tk) { $tcl_library .= " -L$stock_tk_library_dir"; }
+-$tcl_libs = "-ltcl8.5";
+-if ($config_tk) { $tcl_libs = "-ltk8.5 -lX11 " . $tcl_libs; }
++$tcl_libs = "-ltcl";
++if ($config_tk) { $tcl_libs = "-ltk -lX11 " . $tcl_libs; }
+ 
+ @tcl_cc = ();
+ @tcl_cu = ();
+@@ -888,11 +888,7 @@
+ $system_dir = "";
+ $system_include = "-I.";
+ $system_library = "";
+-if ( $config_gcc ) {
+-  $system_libs = "-ll -lm";
+-} else {
+-  $system_libs = "-lm";
+-}
++$system_libs = "-lrt -lm";
+ @system_cc = ();
+ @system_cu = ();
+ @system_ccpp = ();
+@@ -922,7 +918,7 @@
+ # This option enables the use of CUDA GPU acceleration functions.
+ #######################
+ $cuda_defines = "-DVMDCUDA -DMSMPOT_CUDA";
+-$cuda_dir = "/usr/local/cuda-8.0";
++$cuda_dir = "/opt/cuda";
+ $cuda_include = "";
+ $cuda_library = "";
+ $cuda_libs = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart_static";
+@@ -1042,7 +1038,7 @@
+ # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.6.3-linux64";
+ # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.8.0-linux64";
+ # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.9.0-linux64";
+-$liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-4.0.1-linux64";
++$liboptix_dir = "/opt/optix";
+ # NCSA Blue Waters
+ # $liboptix_dir = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.6.3-linux64";
+ # $liboptix_dir = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.8.0-linux64";
+@@ -1082,7 +1078,7 @@
+ $libospray_dir = "/usr/local/ospray-1.1.1.x86_64.linux";
+ $libospray_include = "-I$libospray_dir/include -I$libospray_dir/ospray/include ";
+ $libospray_library = "-L$libospray_dir/lib ";
+-$libospray_libs = "-lospray -lospray_common -lembree -ltbb -ltbbmalloc ";
++$libospray_libs = "-lospray -lospray_common -lembree3 -ltbb -ltbbmalloc ";
+ # Both OptiX and OSPRay renderers use the Tachyon glwin code,
+ # so we have to make sure we don't build/link it twice
+ if ($config_liboptix) {
+@@ -1375,20 +1371,20 @@
+ ###################
+ # location of Python library and include file.
+ # If left blank, standard system directories will be searched.
+-#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/local/include";
++$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/include/python2.7";
+ #$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "/usr/local/lib";
+-$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5";
+-$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config";
++#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.7";
++#$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/config";
+ 
+ #$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "/usr/local/include";
+ #$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "/usr/local/lib";
+-$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include";
+-$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include";
++#$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include";
++#$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/site-packages/numpy/core/include";
+ 
+ $python_defines = "-DVMDPYTHON";
+ $python_include = "-I$stock_python_include_dir -I$stock_numpy_include_dir -I$stock_numpy_library_dir";
+ $python_library = "-L$stock_python_library_dir";
+-$python_libs = "-lpython2.5 -lpthread";
++$python_libs = "-lpython2.7 -lpthread";
+ @python_h = ('PythonTextInterp.h',
+              'VMDTkinterMenu.h',
+              'py_commands.h',
+@@ -2278,9 +2274,9 @@
+ # for compiling with g++:
+ $arch_cc = "gcc";
+ $arch_ccpp = "g++";
+-  $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -O6 -ffast-math";
++  $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -O3 -ffast-math";
+ $arch_depend_flag = "-MM";
+-  $arch_copts = "-m64 -Wall -O6 -ffast-math";
++  $arch_copts = "-m64 -Wall -O3 -ffast-math";
+ $arch_template_repository = "foobar";
+ 
+ # so far only STATIC version tested
+@@ -2293,7 +2289,7 @@
+ 
+ if ($config_cuda) {
+   $arch_nvccflags .= " --machine 64 -O3 $cuda_include";
+-  $cuda_library = "-L/usr/local/cuda-8.0/lib64";
++  $cuda_library = "-L/opt/cuda/lib64";
+ }
+ 
+ $arch_lex = "flex";   # has problems with vendor lex
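[Note, not part of the committed files: the patched configure hard-codes the Arch locations of the optional GPU components and of Python 2.7 instead of the VMD-bundled copies. A rough pre-build sanity check of those assumed paths (a sketch; the /opt/cuda and /opt/optix locations come straight from the patch above):

    $ test -d /opt/cuda && echo "CUDA toolkit present"
    $ test -d /opt/optix && echo "OptiX SDK present"
    $ test -f /usr/include/python2.7/Python.h && echo "Python 2.7 headers present"
    $ ldconfig -p | grep -E 'libospray|libembree3'
]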
-ffast-math"; + $arch_template_repository = "foobar"; + + # so far only STATIC version tested +@@ -2293,7 +2289,7 @@ + + if ($config_cuda) { + $arch_nvccflags .= " --machine 64 -O3 $cuda_include"; +- $cuda_library = "-L/usr/local/cuda-8.0/lib64"; ++ $cuda_library = "-L/opt/cuda/lib64"; + } + + $arch_lex = "flex"; # has problems with vendor lex diff --git a/cuda.patch b/cuda.patch new file mode 100644 index 000000000000..770360ec8d94 --- /dev/null +++ b/cuda.patch @@ -0,0 +1,426 @@ +--- src/CUDAMarchingCubes.cu 2018-03-30 18:52:25.467189457 +0300 ++++ src/CUDAMarchingCubes.cu 2018-03-30 18:52:02.387136244 +0300 +@@ -10,7 +10,7 @@ + * + * $RCSfile: CUDAMarchingCubes.cu,v $ + * $Author: johns $ $Locker: $ $State: Exp $ +- * $Revision: 1.30 $ $Date: 2016/11/28 03:04:58 $ ++ * $Revision: 1.32 $ $Date: 2018/02/15 05:15:02 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -25,14 +25,17 @@ + // + // Description: This class computes an isosurface for a given density grid + // using a CUDA Marching Cubes (MC) alorithm. +-// The implementation is based on the MC demo from the +-// Nvidia GPU Computing SDK, but has been improved +-// and extended. This implementation achieves higher +-// performance by reducing the number of temporary memory +-// buffers, reduces the number of scan calls by using vector +-// integer types, and allows extraction of per-vertex normals +-// optionally computes per-vertex colors if provided with a +-// volumetric texture map. ++// ++// The implementation is loosely based on the MC demo from ++// the Nvidia GPU Computing SDK, but the design has been ++// improved and extended in several ways. ++// ++// This implementation achieves higher performance ++// by reducing the number of temporary memory ++// buffers, reduces the number of scan calls by using ++// vector integer types, and allows extraction of ++// per-vertex normals and optionally computes ++// per-vertex colors if a volumetric texture map is provided. + // + // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de> + // John Stone <johns@ks.uiuc.edu> +@@ -48,7 +51,7 @@ + #include <thrust/functional.h> + + // +-// Restrict macro to make it easy to do perf tuning tess ++// Restrict macro to make it easy to do perf tuning tests + // + #if 0 + #define RESTRICT __restrict__ +@@ -171,6 +174,11 @@ + texture<float, 3, cudaReadModeElementType> volumeTex; + + // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS ++// XXX The sampleVolume() call underperforms vs. peak memory bandwidth ++// because we don't strictly enforce coalescing requirements in the ++// layout of the input volume presently. If we forced X/Y dims to be ++// warp-multiple it would become possible to use wider fetches and ++// a few other tricks to improve global memory bandwidth + __device__ float sampleVolume(const float * RESTRICT data, + uint3 p, uint3 gridSize) { + return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x]; +@@ -592,6 +600,30 @@ + cudaBindTextureToArray(volumeTex, d_vol, desc); + } + ++#if CUDART_VERSION >= 9000 ++// ++// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when ++// used with the built-in uint2 vector integer types. To workaround ++// the problem we have to define our own type and associated conversion ++// routines etc. 
++// ++ ++// XXX workaround for uint2 breakage in CUDA 9.0RC ++struct myuint2 : uint2 { ++ __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {} ++ __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {} ++ __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {} ++}; ++ ++void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) { ++ const uint2 zero = make_uint2(0, 0); ++ thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input), ++ thrust::device_ptr<myuint2>((myuint2*)input + numElements), ++ thrust::device_ptr<myuint2>((myuint2*)output), ++ (myuint2) zero); ++} ++ ++#else + + void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) { + const uint2 zero = make_uint2(0, 0); +@@ -601,6 +633,7 @@ + zero); + } + ++#endif + + void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) { + thrust::inclusive_scan(thrust::device_ptr<float>(input), +@@ -639,11 +672,9 @@ + } + + +-/////////////////////////////////////////////////////////////////////////////// + // + // class CUDAMarchingCubes + // +-/////////////////////////////////////////////////////////////////////////////// + + CUDAMarchingCubes::CUDAMarchingCubes() { + // initialize values +@@ -713,9 +744,6 @@ + } + + +-//////////////////////////////////////////////////////////////////////////////// +-//! Run the Cuda part of the computation +-//////////////////////////////////////////////////////////////////////////////// + void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) { + // check if data is available + if (!this->setdata) + +--- src/CUDAMDFF.cu 2016-12-01 10:11:56.000000000 +0300 ++++ src/CUDAMDFF.cu 2018-03-30 18:56:44.352937599 +0300 +@@ -11,7 +11,7 @@ + * + * $RCSfile: CUDAMDFF.cu,v $ + * $Author: johns $ $Locker: $ $State: Exp $ +- * $Revision: 1.75 $ $Date: 2015/04/07 20:41:26 $ ++ * $Revision: 1.78 $ $Date: 2018/02/19 07:10:37 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -28,12 +28,16 @@ + #include <stdlib.h> + #include <string.h> + #include <cuda.h> +-#include <float.h> // FLT_MAX etc +- ++#if CUDART_VERSION >= 9000 ++#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 ++#endif + #if CUDART_VERSION < 4000 + #error The VMD MDFF feature requires CUDA 4.0 or later + #endif + ++#include <float.h> // FLT_MAX etc ++ ++ + #include "Inform.h" + #include "utilities.h" + #include "WKFThreads.h" +@@ -588,6 +592,43 @@ + } + + ++ ++// #define VMDUSESHUFFLE 1 ++#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000 ++// New warp shuffle-based CC sum reduction for Kepler and later GPUs. ++inline __device__ void cc_sumreduction(int tid, int totaltb, ++ float4 &total_cc_sums, ++ float &total_lcc, ++ int &total_lsize, ++ float4 *tb_cc_sums, ++ float *tb_lcc, ++ int *tb_lsize) { ++ total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f); ++ total_lcc = 0.0f; ++ total_lsize = 0; ++ ++ // use precisely one warp to do the final reduction ++ if (tid < warpSize) { ++ for (int i=tid; i<totaltb; i+=warpSize) { ++ total_cc_sums += tb_cc_sums[i]; ++ total_lcc += tb_lcc[i]; ++ total_lsize += tb_lsize[i]; ++ } ++ ++ // perform intra-warp parallel reduction... 
++ // general loop version of parallel sum-reduction ++ for (int mask=warpSize/2; mask>0; mask>>=1) { ++ total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask); ++ total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask); ++ total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask); ++ total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask); ++ total_lcc += __shfl_xor_sync(0xffffffff, total_lcc, mask); ++ total_lsize += __shfl_xor_sync(0xffffffff, total_lsize, mask); ++ } ++ } ++} ++#else ++// shared memory based CC sum reduction + inline __device__ void cc_sumreduction(int tid, int totaltb, + float4 &total_cc_sums, + float &total_lcc, +@@ -629,6 +670,7 @@ + total_lcc = tb_lcc[0]; + total_lsize = tb_lsize[0]; + } ++#endif + + + inline __device__ void thread_cc_sum(float ref, float density, +@@ -750,6 +792,92 @@ + } + + ++#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000 ++ // all threads write their local sums to shared memory... ++ __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ]; ++ __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ]; ++ __shared__ float tb_lcc_s[TOTALBLOCKSZ]; ++ __shared__ int tb_lsize_s[TOTALBLOCKSZ]; ++ ++ tb_cc_means_s[tid] = thread_cc_means; ++ tb_cc_squares_s[tid] = thread_cc_squares; ++ tb_lcc_s[tid] = thread_lcc; ++ tb_lsize_s[tid] = thread_lsize; ++ __syncthreads(); // all threads must hit syncthreads call... ++ ++ // use precisely one warp to do the thread-block-wide reduction ++ if (tid < warpSize) { ++ float2 tmp_cc_means = make_float2(0.0f, 0.0f); ++ float2 tmp_cc_squares = make_float2(0.0f, 0.0f); ++ float tmp_lcc = 0.0f; ++ int tmp_lsize = 0; ++ for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) { ++ tmp_cc_means += tb_cc_means_s[i]; ++ tmp_cc_squares += tb_cc_squares_s[i]; ++ tmp_lcc += tb_lcc_s[i]; ++ tmp_lsize += tb_lsize_s[i]; ++ } ++ ++ // perform intra-warp parallel reduction... ++ // general loop version of parallel sum-reduction ++ for (int mask=warpSize/2; mask>0; mask>>=1) { ++ tmp_cc_means.x += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask); ++ tmp_cc_means.y += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask); ++ tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask); ++ tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask); ++ tmp_lcc += __shfl_xor_sync(0xffffffff, tmp_lcc, mask); ++ tmp_lsize += __shfl_xor_sync(0xffffffff, tmp_lsize, mask); ++ } ++ ++ // write per-thread-block partial sums to global memory, ++ // if a per-thread-block CC output array is provided, write the ++ // local CC for this thread block out, and finally, check if we ++ // are the last thread block to finish, and finalize the overall ++ // CC results for the entire grid of thread blocks. 
++ if (tid == 0) { ++ unsigned int bid = blockIdx.z * gridDim.x * gridDim.y + ++ blockIdx.y * gridDim.x + blockIdx.x; ++ ++ tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y, ++ tmp_cc_squares.x, tmp_cc_squares.y); ++ tb_lcc[bid] = tmp_lcc; ++ tb_lsize[bid] = tmp_lsize; ++ ++ if (tb_CC != NULL) { ++ float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y, ++ tb_cc_squares_s[0].x, tb_cc_squares_s[0].y, ++ tb_lsize_s[0], tb_lcc_s[0]); ++ ++ // write local per-thread-block CC to global memory ++ tb_CC[bid] = cc; ++ } ++ ++ __threadfence(); ++ ++ unsigned int value = atomicInc(&tbcatomic[0], totaltb); ++ isLastBlockDone = (value == (totaltb - 1)); ++ } ++ } ++ __syncthreads(); ++ ++ if (isLastBlockDone) { ++ float4 total_cc_sums; ++ float total_lcc; ++ int total_lsize; ++ cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize, ++ tb_cc_sums, tb_lcc, tb_lsize); ++ ++ if (tid == 0) { ++ tb_cc_sums[totaltb] = total_cc_sums; ++ tb_lcc[totaltb] = total_lcc; ++ tb_lsize[totaltb] = total_lsize; ++ } ++ ++ reset_atomic_counter(&tbcatomic[0]); ++ } ++ ++#else ++ + // all threads write their local sums to shared memory... + __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ]; + __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ]; +@@ -794,6 +922,7 @@ + } + __syncthreads(); // all threads must hit syncthreads call... + } ++//#endif + + // write per-thread-block partial sums to global memory, + // if a per-thread-block CC output array is provided, write the +@@ -847,6 +976,7 @@ + } + #endif + } ++#endif + } + + + +--- src/CUDAQuickSurf.cu 2016-12-01 10:11:56.000000000 +0300 ++++ src/CUDAQuickSurf.cu 2018-03-30 19:01:38.777196233 +0300 +@@ -11,7 +11,7 @@ + * + * $RCSfile: CUDAQuickSurf.cu,v $ + * $Author: johns $ $Locker: $ $State: Exp $ +- * $Revision: 1.81 $ $Date: 2016/04/20 04:57:46 $ ++ * $Revision: 1.84 $ $Date: 2018/02/15 04:59:15 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -22,6 +22,9 @@ + #include <stdlib.h> + #include <string.h> + #include <cuda.h> ++#if CUDART_VERSION >= 9000 ++#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 ++#endif + + #if CUDART_VERSION < 4000 + #error The VMD QuickSurf feature requires CUDA 4.0 or later +@@ -130,14 +133,14 @@ + #define GUNROLL 1 + #endif + +-#if __CUDA_ARCH__ >= 300 + #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) +-#define MINBLOCKDENS 1 ++#if __CUDA_ARCH__ >= 600 ++#define MINBLOCKDENS 16 ++#elif __CUDA_ARCH__ >= 300 ++#define MINBLOCKDENS 16 + #elif __CUDA_ARCH__ >= 200 +-#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) + #define MINBLOCKDENS 1 + #else +-#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) + #define MINBLOCKDENS 1 + #endif + +@@ -150,7 +153,7 @@ + // + template<class DENSITY, class VOLTEX> + __global__ static void +-// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) ++__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) + gaussdensity_fast_tex_norm(int natoms, + const float4 * RESTRICT sorted_xyzr, + const float4 * RESTRICT sorted_color, +@@ -217,6 +220,8 @@ + for (yab=yabmin; yab<=yabmax; yab++) { + for (xab=xabmin; xab<=xabmax; xab++) { + int abcellidx = zab * acplanesz + yab * acncells.x + xab; ++ // this biggest latency hotspot in the kernel, if we could improve ++ // packing of the grid cell map, we'd likely improve performance + uint2 atomstartend = cellStartEnd[abcellidx]; + if (atomstartend.x != GRID_CELL_EMPTY) { + unsigned int atomid; +@@ -296,7 +301,7 @@ + + + __global__ static void +-// __launch_bounds__ ( MAXTHRDENS, 
MINBLOCKDENS ) ++__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) + gaussdensity_fast_tex3f(int natoms, + const float4 * RESTRICT sorted_xyzr, + const float4 * RESTRICT sorted_color, +@@ -363,6 +368,8 @@ + for (yab=yabmin; yab<=yabmax; yab++) { + for (xab=xabmin; xab<=xabmax; xab++) { + int abcellidx = zab * acplanesz + yab * acncells.x + xab; ++ // this biggest latency hotspot in the kernel, if we could improve ++ // packing of the grid cell map, we'd likely improve performance + uint2 atomstartend = cellStartEnd[abcellidx]; + if (atomstartend.x != GRID_CELL_EMPTY) { + unsigned int atomid; +@@ -550,7 +557,6 @@ + + // per-GPU handle with various memory buffer pointers, etc. + typedef struct { +- /// max grid sizes and attributes the current allocations will support + int verbose; + long int natoms; + int colorperatom; +@@ -561,18 +567,18 @@ + int gy; + int gz; + +- CUDAMarchingCubes *mc; ///< Marching cubes class used to extract surface ++ CUDAMarchingCubes *mc; + +- float *devdensity; ///< density map stored in GPU memory +- void *devvoltexmap; ///< volumetric texture map +- float4 *xyzr_d; ///< atom coords and radii +- float4 *sorted_xyzr_d; ///< cell-sorted coords and radii +- float4 *color_d; ///< colors +- float4 *sorted_color_d; ///< cell-sorted colors +- +- unsigned int *atomIndex_d; ///< cell index for each atom +- unsigned int *atomHash_d; ///< +- uint2 *cellStartEnd_d; ///< cell start/end indices ++ float *devdensity; ++ void *devvoltexmap; ++ float4 *xyzr_d; ++ float4 *sorted_xyzr_d; ++ float4 *color_d; ++ float4 *sorted_color_d; ++ ++ unsigned int *atomIndex_d; ++ unsigned int *atomHash_d; ++ uint2 *cellStartEnd_d; + + void *safety; + float3 *v3f_d; diff --git a/mpi.patch b/mpi.patch new file mode 100644 index 000000000000..b3822a31f665 --- /dev/null +++ b/mpi.patch @@ -0,0 +1,26 @@ +--- configure 2019-05-24 15:48:39.590316691 +0300 ++++ configure 2019-05-24 15:48:07.136424874 +0300 +@@ -970,15 +970,15 @@ + ####################### + $mpi_defines = "-DVMDMPI "; + ## Argonne MPICH +-$mpi_dir = "/usr/lib64/mpich"; +-$mpi_include = "-I/usr/include/mpich-x86_64"; +-$mpi_library = "-L$mpi_dir/lib"; +-$mpi_libs = "-lmpich"; ++#$mpi_dir = "/usr"; ++#$mpi_include = "-I/usr/include"; ++#$mpi_library = "-L$mpi_dir/lib"; ++#$mpi_libs = "-lmpicxx -lmpi"; + ## OpenMPI +-# $mpi_dir = "/usr/lib64/openmpi"; +-# $mpi_include = "-I/usr/include/openmpi-x86_64"; +-# $mpi_library = "-L$mpi_dir/lib"; +-# $mpi_libs = "-lmpi"; ++$mpi_dir = "/usr"; ++$mpi_include = "-I/usr/include"; ++$mpi_library = "-L$mpi_dir/lib/openmpi"; ++$mpi_libs = "-lmpi_cxx -lmpi"; + ## OSU MVAPICH + # $mpi_dir = "/usr/mpi/gcc/mvapich-1.0.0"; + # $mpi_include = "-I$mpi_dir/include"; diff --git a/ospray.patch b/ospray.patch new file mode 100644 index 000000000000..f825563d8f46 --- /dev/null +++ b/ospray.patch @@ -0,0 +1,53 @@ +--- src/OSPRayRenderer.C 2016-12-01 10:11:51.000000000 +0300 ++++ src/OSPRayRenderer.C 2018-03-30 18:22:52.389958174 +0300 +@@ -11,7 +11,7 @@ + * + * $RCSfile: OSPRayRenderer.C + * $Author: johns $ $Locker: $ $State: Exp $ +-* $Revision: 1.60 $ $Date: 2016/11/28 06:00:48 $ ++* $Revision: 1.61 $ $Date: 2017/01/10 13:57:50 $ + * + *************************************************************************** + * DESCRIPTION: +@@ -659,7 +659,7 @@ + if (ao_samples != 0) + lightscale = ao_direct; + +- for (int i = 0; i < directional_lights.num(); ++i) { ++ for (i = 0; i < directional_lights.num(); ++i) { + #if 1 + OSPLight light = ospNewLight(ospRenderer, "distant"); + #else +@@ -2021,7 +2021,7 @@ + ca.cylinders = 
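[Note, not part of the committed files: mpi.patch swaps the stock MPICH settings for OpenMPI installed under /usr (headers in /usr/include, library path /usr/lib/openmpi, linked with -lmpi_cxx -lmpi), which matches the "Assuming openmpi" comment in prepare(). A rough way to confirm that assumption before building (sketch):

    $ pacman -Qi openmpi >/dev/null && echo "openmpi installed"
    $ ldconfig -p | grep -E 'libmpi_cxx|libmpi\.so'

For MPICH instead, edit mpi.patch: re-comment the OpenMPI block and uncomment the MPICH one, adjusting the library names as needed.]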
diff --git a/ospray.patch b/ospray.patch
new file mode 100644
index 000000000000..f825563d8f46
--- /dev/null
+++ b/ospray.patch
@@ -0,0 +1,53 @@
+--- src/OSPRayRenderer.C	2016-12-01 10:11:51.000000000 +0300
++++ src/OSPRayRenderer.C	2018-03-30 18:22:52.389958174 +0300
+@@ -11,7 +11,7 @@
+ *
+ * $RCSfile: OSPRayRenderer.C
+ * $Author: johns $ $Locker:  $ $State: Exp $
+-* $Revision: 1.60 $ $Date: 2016/11/28 06:00:48 $
++* $Revision: 1.61 $ $Date: 2017/01/10 13:57:50 $
+ *
+ ***************************************************************************
+ * DESCRIPTION:
+@@ -659,7 +659,7 @@
+   if (ao_samples != 0)
+     lightscale = ao_direct;
+ 
+-  for (int i = 0; i < directional_lights.num(); ++i) {
++  for (i = 0; i < directional_lights.num(); ++i) {
+ #if 1
+     OSPLight light = ospNewLight(ospRenderer, "distant");
+ #else
+@@ -2021,7 +2021,7 @@
+   ca.cylinders = (float *) calloc(1, cylnum * bytes_per_cylinder);
+   ca.colors = (float *) calloc(1, cylnum * 4 * sizeof(float));
+ 
+-  unsigned int i,ind4,ind6,ind7;
++  int i,ind4,ind6,ind7;
+   const int rOffset = 6; // radius offset
+   if (wtrans == NULL) {
+     for (i=0,ind4=0,ind6=0,ind7=0; i<cylnum; i++,ind4+=4,ind6+=6,ind7+=7) {
+@@ -2469,10 +2469,7 @@
+   int i, ind, ind9, ind12;
+ 
+   const float ci2f = 1.0f / 255.0f;
+-  const float cn2f = 1.0f / 127.5f;
+   for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) {
+-    float norm[9];
+-
+     // transform to eye coordinates
+     wtrans.multpoint3d(v + ind9    , (float*) &mesh.v[ind9    ]);
+     wtrans.multpoint3d(v + ind9 + 3, (float*) &mesh.v[ind9 + 3]);
+@@ -2627,12 +2624,6 @@
+   // create and fill the OSPRay trimesh memory buffer
+   int i, ind, ind9, ind12;
+ 
+-  const rgba c = { uniform_color[0],
+-                   uniform_color[1],
+-                   uniform_color[2],
+-                   1.0f /* mat_opacity*/ };
+-
+-
+   for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) {
+     // transform to eye coordinates
+     wtrans.multpoint3d(v + ind9    , (float*) &mesh.v[ind9    ]);