diff options
-rw-r--r-- | .SRCINFO | 27 | ||||
-rw-r--r-- | PKGBUILD | 96 | ||||
-rw-r--r-- | configure.patch | 202 | ||||
-rw-r--r-- | cuda.patch | 426 | ||||
-rw-r--r-- | mpi.patch | 26 | ||||
-rw-r--r-- | ospray.patch | 53 |
6 files changed, 217 insertions, 613 deletions
@@ -1,34 +1,31 @@ pkgbase = vmd-src pkgdesc = Visual Molecular Dynamics - pkgver = 1.9.3 - pkgrel = 1 + pkgver = 1.9.4a57 + pkgrel = 4 url = http://www.ks.uiuc.edu/Research/vmd/ arch = x86_64 + arch = aarch64 license = custom - makedepends = gcc + makedepends = opencl-headers depends = tcsh depends = tk - depends = python2-numpy + depends = python-numpy depends = fltk - depends = ospray depends = netcdf depends = ocl-icd + depends = libxi + depends = openmpi optdepends = openbabel: additional file formats support optdepends = sqlite: dmsplugin + optdepends = ospray: accelerated ray tracing for Intel CPUs optdepends = optix: accelerated ray tracing for NVIDIA GPUs optdepends = cuda: NVIDIA CUDA GPU acceleration functions + provides = vmd conflicts = vmd conflicts = vmd-bin - source = local://vmd-1.9.3.src.tar.gz + source = local://vmd-1.9.4a57.src.tar.gz source = configure.patch - source = ospray.patch - source = cuda.patch - source = mpi.patch - sha256sums = 5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128 - sha256sums = c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275 - sha256sums = 09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d - sha256sums = 8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa - sha256sums = e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0 + sha256sums = de278d0c5d969336d89068e0806fb50aaa0cb0f546ba985d840b279357860679 + sha256sums = a74a8bbee40667742907b59aa24bdb37607761389a9c332c2d449ef07a2f0937 pkgname = vmd-src - @@ -1,46 +1,40 @@ -# Maintainer: Anton Kudelin <kudelin at protonmail dot com> +# Maintainer: Anton Kudelin <kudelin at protonmail dot com> # Contributor: Eric Berquist <eric DOT berquist AT gmail> # Contributor: steabert <steabert@member.fsf.org> # Contributor: Ricardo Honorato Z. pkgname=vmd-src _pkgname=vmd -pkgver=1.9.3 -pkgrel=1 +pkgver=1.9.4a57 +pkgrel=4 pkgdesc="Visual Molecular Dynamics" url="http://www.ks.uiuc.edu/Research/vmd/" -license=('custom') -arch=('x86_64') -depends=('tcsh' 'tk' 'python2-numpy' 'fltk' 'ospray' 'netcdf' 'ocl-icd') -makedepends=('gcc') +license=(custom) +arch=(x86_64 aarch64) +depends=(tcsh tk python-numpy fltk netcdf ocl-icd libxi openmpi) +makedepends=(opencl-headers) optdepends=('openbabel: additional file formats support' 'sqlite: dmsplugin' + 'ospray: accelerated ray tracing for Intel CPUs' 'optix: accelerated ray tracing for NVIDIA GPUs' 'cuda: NVIDIA CUDA GPU acceleration functions') -conflicts=("$_pkgname" "$_pkgname-bin") -# You have to download the package from the VMD url and put it in the PKGBUILD folder. -source=("local://$_pkgname-${pkgver}.src.tar.gz" - "configure.patch" - "ospray.patch" - "cuda.patch" - "mpi.patch") -sha256sums=('5bdc314dc836d620fe510ed4b6c3dbe3cf66525b61680ffec4e2563cf495f128' - 'c25475d51be75c2b9e3bcd5f8d5d3ed1fd0df992f9c8020b0346c49f6c40b275' - '09f1cf35ebd302095a68d2b5db03e1671493411aed3f839baf2b1f44dc7e0e0d' - '8880006bd97f7397d5a5592aaeeb6e80530d8c6026280c1b961e43f568e12daa' - 'e281a57831b8ff60c5a644219f0b6289d32bee239978af676474941c7d8548c0') +provides=($_pkgname) +conflicts=($_pkgname $_pkgname-bin) +# You have to download the package from the VMD url +# and put it in the PKGBUILD folder. +source=(local://$_pkgname-${pkgver}.src.tar.gz + configure.patch) +sha256sums=('de278d0c5d969336d89068e0806fb50aaa0cb0f546ba985d840b279357860679' + 'a74a8bbee40667742907b59aa24bdb37607761389a9c332c2d449ef07a2f0937') prepare() { sed -i 's/ltcl8.5/ltcl/g' plugins/Make-arch cd $_pkgname-$pkgver mkdir plugins sed -i 's#:${LD_LIBRARY_PATH}/:${LD_LIBRARY_PATH}:#/opt/optix/lib64#g' bin/* + # Assuming openmpi; if it's not the case edit configure.patch patch -p0 < ../configure.patch - patch -p0 < ../ospray.patch - patch -p0 < ../cuda.patch - # Assuming openmpi; if not the case edit mpi.patch - patch -p0 < ../mpi.patch export TCLINC="-I/usr/include" export TCLLIB="-L/usr/lib" export PLUGINDIR=$srcdir/$_pkgname-$pkgver/plugins @@ -48,26 +42,66 @@ prepare() { export VMDINSTALLLIBRARYDIR=$pkgdir/usr/lib/vmd # Enable CUDA if nvcc is in PATH - if [ $( echo -n $( which nvcc) | tail -c 4 ) == nvcc ] + if command -v nvcc &> /dev/null then export ACC=CUDA fi + + # Enable OSPRAY if ospray_mpi_worker is in PATH + if command -v ospray_mpi_worker &> /dev/null + then + export RAY=LIBOSPRAY2 + fi + + # Architecture + export MACHINE=LINUXAMD64 + if [ $CARCH == 'aarch64' ] + then + export MACHINE=LINUXARM64 + fi } build() { - cd $srcdir/plugins - make -j1 LINUXAMD64 world + cd "$srcdir/plugins" + make -j1 $MACHINE make distrib + cd ../$_pkgname-$pkgver - ./configure LINUXAMD64 OPENGL EGLPBUFFER FLTKOPENGL FLTK TK $ACC IMD OPENCL MPI XINERAMA XINPUT LIBOSPRAY LIBPNG NETCDF COLVARS TCL PYTHON PTHREADS GCC + ./configure \ + $MACHINE \ + $ACC \ + $RAY \ + OPENGL \ + EGLPBUFFER \ + FLTKOPENGL \ + FLTK \ + TK \ + IMD \ + OPENCL \ + MPI \ + XINERAMA \ + XINPUT \ + LIBPNG \ + ZLIB \ + NETCDF \ + COLVARS \ + TCL \ + PYTHON \ + NUMPY \ + PTHREADS \ + GCC + cd src make veryclean make } package() { - cd $srcdir/$_pkgname-$pkgver - install -D -m644 LICENSE $pkgdir/usr/share/licenses/$pkgname/LICENSE - cd src; make install - sed -i 's#set defaultvmddir=.*#set defaultvmddir=/usr/lib/vmd#' $pkgdir/usr/bin/vmd + cd "$srcdir/$_pkgname-$pkgver" + install -Dm644 LICENSE "$pkgdir/usr/share/licenses/$pkgname/LICENSE" + + cd src + make install + sed -i 's#set defaultvmddir=.*#set defaultvmddir=/usr/lib/vmd#' \ + "$pkgdir/usr/bin/vmd" } diff --git a/configure.patch b/configure.patch index ad6dd0acb5e3..106fc6998ad0 100644 --- a/configure.patch +++ b/configure.patch @@ -1,15 +1,34 @@ ---- configure 2016-12-01 10:11:33.000000000 +0300 -+++ configure 2019-05-24 15:40:14.742991210 +0300 -@@ -466,7 +466,7 @@ +--- configure 2021-08-08 10:32:02.637988859 +0000 ++++ configure 2021-08-08 10:31:14.808470930 +0000 +@@ -497,19 +497,19 @@ $arch_cc = "cc"; $arch_ccpp = "CC"; --$arch_nvcc = "/usr/local/cuda-8.0/bin/nvcc"; +-$arch_nvcc = "/usr/local/cuda-10.2/bin/nvcc"; +$arch_nvcc = "nvcc"; - $arch_nvccflags = "--ptxas-options=-v " . - "-gencode arch=compute_30,code=compute_30 " . - "-gencode arch=compute_30,code=sm_35 " . -@@ -729,8 +729,8 @@ + $arch_nvccflags = "-lineinfo --ptxas-options=-v " . +- "-gencode arch=compute_30,code=compute_30 " . +- "-gencode arch=compute_30,code=sm_35 " . +- "-gencode arch=compute_30,code=sm_37 " . +- "-gencode arch=compute_50,code=compute_50 " . ++ "-gencode arch=compute_35,code=sm_35 " . ++ "-gencode arch=compute_35,code=sm_37 " . + "-gencode arch=compute_50,code=sm_50 " . +- "-gencode arch=compute_60,code=compute_60 " . ++ "-gencode arch=compute_52,code=sm_52 " . + "-gencode arch=compute_60,code=sm_60 " . +- "-gencode arch=compute_70,code=compute_70 " . ++ "-gencode arch=compute_61,code=sm_61 " . + "-gencode arch=compute_70,code=sm_70 " . ++ "-gencode arch=compute_75,code=sm_75 " . ++ "-gencode arch=compute_80,code=sm_80 " . ++ "-gencode arch=compute_86,code=sm_86 " . + "--ftz=true "; +-# "-gencode arch=compute_75,code=sm_75 " . + $arch_gcc = "gcc"; + $arch_gccpp = "g++"; + $arch_lex = "lex"; +@@ -782,8 +782,8 @@ if ($config_tk) { $tcl_include .= " -I$stock_tk_include_dir"; } $tcl_library = "-L$stock_tcl_library_dir"; if ($config_tk) { $tcl_library .= " -L$stock_tk_library_dir"; } @@ -20,91 +39,150 @@ @tcl_cc = (); @tcl_cu = (); -@@ -888,11 +888,7 @@ +@@ -911,13 +911,7 @@ $system_dir = ""; $system_include = "-I."; $system_library = ""; -if ( $config_gcc ) { -- $system_libs = "-ll -lm"; +- ## Note: some old commercial versions of Unix require -ll for lex internals +- ## now that we've been using flex for years, -ll should be moot. +- $system_libs = "-lm"; -} else { - $system_libs = "-lm"; -} -+$system_libs = "-lrt -lm"; ++$system_libs = "-lrt -lm"; @system_cc = (); @system_cu = (); @system_ccpp = (); -@@ -922,7 +918,7 @@ +@@ -992,10 +986,10 @@ # This option enables the use of CUDA GPU acceleration functions. ####################### $cuda_defines = "-DVMDCUDA -DMSMPOT_CUDA"; --$cuda_dir = "/usr/local/cuda-8.0"; +-$cuda_dir = "/usr/local/cuda-10.2"; +$cuda_dir = "/opt/cuda"; $cuda_include = ""; $cuda_library = ""; - $cuda_libs = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart_static"; -@@ -1042,7 +1038,7 @@ - # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.6.3-linux64"; - # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.8.0-linux64"; - # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-3.9.0-linux64"; --$liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-4.0.1-linux64"; +-$cuda_libs = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart_static -lrt"; ++$cuda_libs = "-Wl,-rpath -Wl,\$\$ORIGIN/ -lcudart -lrt"; + @cuda_cc = (); + @cuda_cu = ('msmpot_cuda.cu', + 'msmpot_cuda_latcut.cu', +@@ -1052,15 +1046,15 @@ + ####################### + $mpi_defines = "-DVMDMPI "; + ## Argonne MPICH +-$mpi_dir = "/usr/lib64/mpich"; +-$mpi_include = "-I/usr/include/mpich-x86_64"; +-$mpi_library = "-L$mpi_dir/lib"; +-$mpi_libs = "-lmpich"; ++#$mpi_dir = "/usr"; ++#$mpi_include = "-I/usr/include"; ++#$mpi_library = "-L$mpi_dir/lib"; ++#$mpi_libs = "-lmpicxx -lmpi"; + ## OpenMPI +-# $mpi_dir = "/usr/lib64/openmpi"; +-# $mpi_include = "-I/usr/include/openmpi-x86_64"; +-# $mpi_library = "-L$mpi_dir/lib"; +-# $mpi_libs = "-lmpi"; ++$mpi_dir = "/usr"; ++$mpi_include = "-I/usr/include"; ++$mpi_library = "-L$mpi_dir/lib"; ++$mpi_libs = "-lmpi"; + ## OSU MVAPICH + # $mpi_dir = "/usr/mpi/gcc/mvapich-1.0.0"; + # $mpi_include = "-I$mpi_dir/include"; +@@ -1201,7 +1195,7 @@ + # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-5.0.1-linux64"; + # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-5.1.0-linux64"; + # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-6.0.0-linux64"; +-$liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-6.5.0-linux64"; +$liboptix_dir = "/opt/optix"; + # $liboptix_dir = "/usr/local/encap/NVIDIA-OptiX-SDK-7.0.0-linux64"; + # NCSA Blue Waters - # $liboptix_dir = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.6.3-linux64"; - # $liboptix_dir = "/u/sciteam/stonej/local/NVIDIA-OptiX-SDK-3.8.0-linux64"; -@@ -1082,7 +1078,7 @@ - $libospray_dir = "/usr/local/ospray-1.1.1.x86_64.linux"; - $libospray_include = "-I$libospray_dir/include -I$libospray_dir/ospray/include "; - $libospray_library = "-L$libospray_dir/lib "; --$libospray_libs = "-lospray -lospray_common -lembree -ltbb -ltbbmalloc "; -+$libospray_libs = "-lospray -lospray_common -lembree3 -ltbb -ltbbmalloc "; - # Both OptiX and OSPRay renderers use the Tachyon glwin code, - # so we have to make sure we don't build/link it twice - if ($config_liboptix) { -@@ -1375,20 +1371,20 @@ - ################### - # location of Python library and include file. - # If left blank, standard system directories will be searched. --#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/local/include"; -+$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/include/python2.7"; - #$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "/usr/local/lib"; --$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5"; --$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config"; -+#$stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.7"; -+#$stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/config"; +@@ -1314,7 +1308,7 @@ + $libospray2_dir = "/usr/local/ospray-2.1.1.x86_64.linux"; + $libospray2_include = "-I$libospray2_dir/include -I$libospray2_dir/ospray/include "; + $libospray2_library = "-L$libospray2_dir/lib "; +- $libospray2_libs = "-lospray -lospcommon -lembree3 -ltbb -ltbbmalloc "; ++ $libospray2_libs = "-lospray -lembree3 -ltbb -ltbbmalloc "; + } - #$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "/usr/local/include"; - #$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "/usr/local/lib"; --$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include"; --$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include"; -+#$stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include"; -+#$stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.7/site-packages/numpy/core/include"; + # Both OptiX and OSPRay renderers use the Tachyon glwin code, +@@ -1650,16 +1644,16 @@ + $stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$conda_root/lib/python-3.7/site-packages/numpy/core/include"; + $python_libs = "-fno-lto -lpython3.7m -lpthread"; + } else { +-# $stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/local/include"; +-# $stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "/usr/local/lib"; +- $stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5"; +- $stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config"; +- +-# $stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "/usr/local/include"; +-# $stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "/usr/local/lib"; +- $stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include"; +- $stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include"; +- $python_libs = "-lpython2.5 -lpthread"; ++ $stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "/usr/include/python3.12"; ++ $stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "/usr/lib/python3.12/config-3.12-x86_64-linux-gnu"; ++# $stock_python_include_dir=$ENV{"PYTHON_INCLUDE_DIR"} || "$vmd_library_dir/python/lib_$config_arch/include/python2.5"; ++# $stock_python_library_dir=$ENV{"PYTHON_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/config"; ++ ++ $stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "/usr/lib/python3.12/site-packages/numpy/core/include/numpy"; ++ $stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "/usr/lib/python3.12/site-packages/numpy/core/include"; ++# $stock_numpy_include_dir=$ENV{"NUMPY_INCLUDE_DIR"} || "$vmd_library_dir/numpy/lib_$config_arch/include"; ++# $stock_numpy_library_dir=$ENV{"NUMPY_LIBRARY_DIR"} || "$vmd_library_dir/python/lib_$config_arch/lib/python2.5/site-packages/numpy/core/include"; ++ $python_libs = "-lpython3.12 -lpthread"; + } $python_defines = "-DVMDPYTHON"; - $python_include = "-I$stock_python_include_dir -I$stock_numpy_include_dir -I$stock_numpy_library_dir"; - $python_library = "-L$stock_python_library_dir"; --$python_libs = "-lpython2.5 -lpthread"; -+$python_libs = "-lpython2.7 -lpthread"; - @python_h = ('PythonTextInterp.h', - 'VMDTkinterMenu.h', - 'py_commands.h', -@@ -2278,9 +2274,9 @@ +@@ -2559,9 +2553,9 @@ # for compiling with g++: $arch_cc = "gcc"; $arch_ccpp = "g++"; -- $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -O6 -ffast-math"; -+ $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -O3 -ffast-math"; +- $arch_opt_flag = "-m64 -fno-for-scope -Wno-deprecated -Wall -Wno-unknown-pragmas -O6 -ffast-math"; ++ $arch_opt_flag = "-Ofast -march=native"; $arch_depend_flag = "-MM"; -- $arch_copts = "-m64 -Wall -O6 -ffast-math"; -+ $arch_copts = "-m64 -Wall -O3 -ffast-math"; - $arch_template_repository = "foobar"; +- $arch_copts = "-m64 -Wall -Wno-unknown-pragmas -O6 -ffast-math"; ++ $arch_copts = "-Ofast -march=native"; # so far only STATIC version tested -@@ -2293,7 +2289,7 @@ + if ($config_static) { +@@ -2573,7 +2567,7 @@ if ($config_cuda) { $arch_nvccflags .= " --machine 64 -O3 $cuda_include"; -- $cuda_library = "-L/usr/local/cuda-8.0/lib64"; +- $cuda_library = "-L/usr/local/cuda-10.2/lib64"; + $cuda_library = "-L/opt/cuda/lib64"; } $arch_lex = "flex"; # has problems with vendor lex +@@ -2605,13 +2599,13 @@ + + + if ($config_arch eq "LINUXARM64") { +- $tcl_libs = "-ltcl8.5"; +- if ($config_tk) { $tcl_libs = "-ltk8.5 -lX11 " . $tcl_libs; } ++ $tcl_libs = "-ltcl"; ++ if ($config_tk) { $tcl_libs = "-ltk -lX11 " . $tcl_libs; } + $vmd_libs = "$tcl_libs -lz"; + + $arch_nvcc = "/usr/local/cuda/bin/nvcc"; + $arch_nvccflags = "--ptxas-options=-v " . +- "-gencode arch=compute_30,code=compute_30 " . ++ "-gencode arch=compute_35,code=compute_35 " . + "-gencode arch=compute_70,code=compute_70 " . + "--ftz=true "; + $cuda_library = "-L/usr/local/cuda/lib64"; +@@ -2635,11 +2629,7 @@ + + $arch_nvcc = "/usr/local/cuda-5.5/bin/nvcc"; + $arch_nvccflags = "--ptxas-options=-v " . +- "-gencode arch=compute_20,code=sm_20 " . +- "-gencode arch=compute_30,code=sm_30 " . + "-gencode arch=compute_35,code=sm_35 " . +- "-gencode arch=compute_20,code=compute_20 " . +- "-gencode arch=compute_30,code=compute_30 " . + "--ftz=true "; + $cuda_library = "-L/usr/local/cuda/lib"; + $arch_cc = "cc"; diff --git a/cuda.patch b/cuda.patch deleted file mode 100644 index 770360ec8d94..000000000000 --- a/cuda.patch +++ /dev/null @@ -1,426 +0,0 @@ ---- src/CUDAMarchingCubes.cu 2018-03-30 18:52:25.467189457 +0300 -+++ src/CUDAMarchingCubes.cu 2018-03-30 18:52:02.387136244 +0300 -@@ -10,7 +10,7 @@ - * - * $RCSfile: CUDAMarchingCubes.cu,v $ - * $Author: johns $ $Locker: $ $State: Exp $ -- * $Revision: 1.30 $ $Date: 2016/11/28 03:04:58 $ -+ * $Revision: 1.32 $ $Date: 2018/02/15 05:15:02 $ - * - *************************************************************************** - * DESCRIPTION: -@@ -25,14 +25,17 @@ - // - // Description: This class computes an isosurface for a given density grid - // using a CUDA Marching Cubes (MC) alorithm. --// The implementation is based on the MC demo from the --// Nvidia GPU Computing SDK, but has been improved --// and extended. This implementation achieves higher --// performance by reducing the number of temporary memory --// buffers, reduces the number of scan calls by using vector --// integer types, and allows extraction of per-vertex normals --// optionally computes per-vertex colors if provided with a --// volumetric texture map. -+// -+// The implementation is loosely based on the MC demo from -+// the Nvidia GPU Computing SDK, but the design has been -+// improved and extended in several ways. -+// -+// This implementation achieves higher performance -+// by reducing the number of temporary memory -+// buffers, reduces the number of scan calls by using -+// vector integer types, and allows extraction of -+// per-vertex normals and optionally computes -+// per-vertex colors if a volumetric texture map is provided. - // - // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de> - // John Stone <johns@ks.uiuc.edu> -@@ -48,7 +51,7 @@ - #include <thrust/functional.h> - - // --// Restrict macro to make it easy to do perf tuning tess -+// Restrict macro to make it easy to do perf tuning tests - // - #if 0 - #define RESTRICT __restrict__ -@@ -171,6 +174,11 @@ - texture<float, 3, cudaReadModeElementType> volumeTex; - - // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS -+// XXX The sampleVolume() call underperforms vs. peak memory bandwidth -+// because we don't strictly enforce coalescing requirements in the -+// layout of the input volume presently. If we forced X/Y dims to be -+// warp-multiple it would become possible to use wider fetches and -+// a few other tricks to improve global memory bandwidth - __device__ float sampleVolume(const float * RESTRICT data, - uint3 p, uint3 gridSize) { - return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x]; -@@ -592,6 +600,30 @@ - cudaBindTextureToArray(volumeTex, d_vol, desc); - } - -+#if CUDART_VERSION >= 9000 -+// -+// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when -+// used with the built-in uint2 vector integer types. To workaround -+// the problem we have to define our own type and associated conversion -+// routines etc. -+// -+ -+// XXX workaround for uint2 breakage in CUDA 9.0RC -+struct myuint2 : uint2 { -+ __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {} -+ __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {} -+ __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {} -+}; -+ -+void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) { -+ const uint2 zero = make_uint2(0, 0); -+ thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input), -+ thrust::device_ptr<myuint2>((myuint2*)input + numElements), -+ thrust::device_ptr<myuint2>((myuint2*)output), -+ (myuint2) zero); -+} -+ -+#else - - void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) { - const uint2 zero = make_uint2(0, 0); -@@ -601,6 +633,7 @@ - zero); - } - -+#endif - - void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) { - thrust::inclusive_scan(thrust::device_ptr<float>(input), -@@ -639,11 +672,9 @@ - } - - --/////////////////////////////////////////////////////////////////////////////// - // - // class CUDAMarchingCubes - // --/////////////////////////////////////////////////////////////////////////////// - - CUDAMarchingCubes::CUDAMarchingCubes() { - // initialize values -@@ -713,9 +744,6 @@ - } - - --//////////////////////////////////////////////////////////////////////////////// --//! Run the Cuda part of the computation --//////////////////////////////////////////////////////////////////////////////// - void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) { - // check if data is available - if (!this->setdata) - ---- src/CUDAMDFF.cu 2016-12-01 10:11:56.000000000 +0300 -+++ src/CUDAMDFF.cu 2018-03-30 18:56:44.352937599 +0300 -@@ -11,7 +11,7 @@ - * - * $RCSfile: CUDAMDFF.cu,v $ - * $Author: johns $ $Locker: $ $State: Exp $ -- * $Revision: 1.75 $ $Date: 2015/04/07 20:41:26 $ -+ * $Revision: 1.78 $ $Date: 2018/02/19 07:10:37 $ - * - *************************************************************************** - * DESCRIPTION: -@@ -28,12 +28,16 @@ - #include <stdlib.h> - #include <string.h> - #include <cuda.h> --#include <float.h> // FLT_MAX etc -- -+#if CUDART_VERSION >= 9000 -+#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 -+#endif - #if CUDART_VERSION < 4000 - #error The VMD MDFF feature requires CUDA 4.0 or later - #endif - -+#include <float.h> // FLT_MAX etc -+ -+ - #include "Inform.h" - #include "utilities.h" - #include "WKFThreads.h" -@@ -588,6 +592,43 @@ - } - - -+ -+// #define VMDUSESHUFFLE 1 -+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000 -+// New warp shuffle-based CC sum reduction for Kepler and later GPUs. -+inline __device__ void cc_sumreduction(int tid, int totaltb, -+ float4 &total_cc_sums, -+ float &total_lcc, -+ int &total_lsize, -+ float4 *tb_cc_sums, -+ float *tb_lcc, -+ int *tb_lsize) { -+ total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f); -+ total_lcc = 0.0f; -+ total_lsize = 0; -+ -+ // use precisely one warp to do the final reduction -+ if (tid < warpSize) { -+ for (int i=tid; i<totaltb; i+=warpSize) { -+ total_cc_sums += tb_cc_sums[i]; -+ total_lcc += tb_lcc[i]; -+ total_lsize += tb_lsize[i]; -+ } -+ -+ // perform intra-warp parallel reduction... -+ // general loop version of parallel sum-reduction -+ for (int mask=warpSize/2; mask>0; mask>>=1) { -+ total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask); -+ total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask); -+ total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask); -+ total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask); -+ total_lcc += __shfl_xor_sync(0xffffffff, total_lcc, mask); -+ total_lsize += __shfl_xor_sync(0xffffffff, total_lsize, mask); -+ } -+ } -+} -+#else -+// shared memory based CC sum reduction - inline __device__ void cc_sumreduction(int tid, int totaltb, - float4 &total_cc_sums, - float &total_lcc, -@@ -629,6 +670,7 @@ - total_lcc = tb_lcc[0]; - total_lsize = tb_lsize[0]; - } -+#endif - - - inline __device__ void thread_cc_sum(float ref, float density, -@@ -750,6 +792,92 @@ - } - - -+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000 -+ // all threads write their local sums to shared memory... -+ __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ]; -+ __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ]; -+ __shared__ float tb_lcc_s[TOTALBLOCKSZ]; -+ __shared__ int tb_lsize_s[TOTALBLOCKSZ]; -+ -+ tb_cc_means_s[tid] = thread_cc_means; -+ tb_cc_squares_s[tid] = thread_cc_squares; -+ tb_lcc_s[tid] = thread_lcc; -+ tb_lsize_s[tid] = thread_lsize; -+ __syncthreads(); // all threads must hit syncthreads call... -+ -+ // use precisely one warp to do the thread-block-wide reduction -+ if (tid < warpSize) { -+ float2 tmp_cc_means = make_float2(0.0f, 0.0f); -+ float2 tmp_cc_squares = make_float2(0.0f, 0.0f); -+ float tmp_lcc = 0.0f; -+ int tmp_lsize = 0; -+ for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) { -+ tmp_cc_means += tb_cc_means_s[i]; -+ tmp_cc_squares += tb_cc_squares_s[i]; -+ tmp_lcc += tb_lcc_s[i]; -+ tmp_lsize += tb_lsize_s[i]; -+ } -+ -+ // perform intra-warp parallel reduction... -+ // general loop version of parallel sum-reduction -+ for (int mask=warpSize/2; mask>0; mask>>=1) { -+ tmp_cc_means.x += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask); -+ tmp_cc_means.y += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask); -+ tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask); -+ tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask); -+ tmp_lcc += __shfl_xor_sync(0xffffffff, tmp_lcc, mask); -+ tmp_lsize += __shfl_xor_sync(0xffffffff, tmp_lsize, mask); -+ } -+ -+ // write per-thread-block partial sums to global memory, -+ // if a per-thread-block CC output array is provided, write the -+ // local CC for this thread block out, and finally, check if we -+ // are the last thread block to finish, and finalize the overall -+ // CC results for the entire grid of thread blocks. -+ if (tid == 0) { -+ unsigned int bid = blockIdx.z * gridDim.x * gridDim.y + -+ blockIdx.y * gridDim.x + blockIdx.x; -+ -+ tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y, -+ tmp_cc_squares.x, tmp_cc_squares.y); -+ tb_lcc[bid] = tmp_lcc; -+ tb_lsize[bid] = tmp_lsize; -+ -+ if (tb_CC != NULL) { -+ float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y, -+ tb_cc_squares_s[0].x, tb_cc_squares_s[0].y, -+ tb_lsize_s[0], tb_lcc_s[0]); -+ -+ // write local per-thread-block CC to global memory -+ tb_CC[bid] = cc; -+ } -+ -+ __threadfence(); -+ -+ unsigned int value = atomicInc(&tbcatomic[0], totaltb); -+ isLastBlockDone = (value == (totaltb - 1)); -+ } -+ } -+ __syncthreads(); -+ -+ if (isLastBlockDone) { -+ float4 total_cc_sums; -+ float total_lcc; -+ int total_lsize; -+ cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize, -+ tb_cc_sums, tb_lcc, tb_lsize); -+ -+ if (tid == 0) { -+ tb_cc_sums[totaltb] = total_cc_sums; -+ tb_lcc[totaltb] = total_lcc; -+ tb_lsize[totaltb] = total_lsize; -+ } -+ -+ reset_atomic_counter(&tbcatomic[0]); -+ } -+ -+#else -+ - // all threads write their local sums to shared memory... - __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ]; - __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ]; -@@ -794,6 +922,7 @@ - } - __syncthreads(); // all threads must hit syncthreads call... - } -+//#endif - - // write per-thread-block partial sums to global memory, - // if a per-thread-block CC output array is provided, write the -@@ -847,6 +976,7 @@ - } - #endif - } -+#endif - } - - - ---- src/CUDAQuickSurf.cu 2016-12-01 10:11:56.000000000 +0300 -+++ src/CUDAQuickSurf.cu 2018-03-30 19:01:38.777196233 +0300 -@@ -11,7 +11,7 @@ - * - * $RCSfile: CUDAQuickSurf.cu,v $ - * $Author: johns $ $Locker: $ $State: Exp $ -- * $Revision: 1.81 $ $Date: 2016/04/20 04:57:46 $ -+ * $Revision: 1.84 $ $Date: 2018/02/15 04:59:15 $ - * - *************************************************************************** - * DESCRIPTION: -@@ -22,6 +22,9 @@ - #include <stdlib.h> - #include <string.h> - #include <cuda.h> -+#if CUDART_VERSION >= 9000 -+#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0 -+#endif - - #if CUDART_VERSION < 4000 - #error The VMD QuickSurf feature requires CUDA 4.0 or later -@@ -130,14 +133,14 @@ - #define GUNROLL 1 - #endif - --#if __CUDA_ARCH__ >= 300 - #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) --#define MINBLOCKDENS 1 -+#if __CUDA_ARCH__ >= 600 -+#define MINBLOCKDENS 16 -+#elif __CUDA_ARCH__ >= 300 -+#define MINBLOCKDENS 16 - #elif __CUDA_ARCH__ >= 200 --#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) - #define MINBLOCKDENS 1 - #else --#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ ) - #define MINBLOCKDENS 1 - #endif - -@@ -150,7 +153,7 @@ - // - template<class DENSITY, class VOLTEX> - __global__ static void --// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) -+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) - gaussdensity_fast_tex_norm(int natoms, - const float4 * RESTRICT sorted_xyzr, - const float4 * RESTRICT sorted_color, -@@ -217,6 +220,8 @@ - for (yab=yabmin; yab<=yabmax; yab++) { - for (xab=xabmin; xab<=xabmax; xab++) { - int abcellidx = zab * acplanesz + yab * acncells.x + xab; -+ // this biggest latency hotspot in the kernel, if we could improve -+ // packing of the grid cell map, we'd likely improve performance - uint2 atomstartend = cellStartEnd[abcellidx]; - if (atomstartend.x != GRID_CELL_EMPTY) { - unsigned int atomid; -@@ -296,7 +301,7 @@ - - - __global__ static void --// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) -+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS ) - gaussdensity_fast_tex3f(int natoms, - const float4 * RESTRICT sorted_xyzr, - const float4 * RESTRICT sorted_color, -@@ -363,6 +368,8 @@ - for (yab=yabmin; yab<=yabmax; yab++) { - for (xab=xabmin; xab<=xabmax; xab++) { - int abcellidx = zab * acplanesz + yab * acncells.x + xab; -+ // this biggest latency hotspot in the kernel, if we could improve -+ // packing of the grid cell map, we'd likely improve performance - uint2 atomstartend = cellStartEnd[abcellidx]; - if (atomstartend.x != GRID_CELL_EMPTY) { - unsigned int atomid; -@@ -550,7 +557,6 @@ - - // per-GPU handle with various memory buffer pointers, etc. - typedef struct { -- /// max grid sizes and attributes the current allocations will support - int verbose; - long int natoms; - int colorperatom; -@@ -561,18 +567,18 @@ - int gy; - int gz; - -- CUDAMarchingCubes *mc; ///< Marching cubes class used to extract surface -+ CUDAMarchingCubes *mc; - -- float *devdensity; ///< density map stored in GPU memory -- void *devvoltexmap; ///< volumetric texture map -- float4 *xyzr_d; ///< atom coords and radii -- float4 *sorted_xyzr_d; ///< cell-sorted coords and radii -- float4 *color_d; ///< colors -- float4 *sorted_color_d; ///< cell-sorted colors -- -- unsigned int *atomIndex_d; ///< cell index for each atom -- unsigned int *atomHash_d; ///< -- uint2 *cellStartEnd_d; ///< cell start/end indices -+ float *devdensity; -+ void *devvoltexmap; -+ float4 *xyzr_d; -+ float4 *sorted_xyzr_d; -+ float4 *color_d; -+ float4 *sorted_color_d; -+ -+ unsigned int *atomIndex_d; -+ unsigned int *atomHash_d; -+ uint2 *cellStartEnd_d; - - void *safety; - float3 *v3f_d; diff --git a/mpi.patch b/mpi.patch deleted file mode 100644 index b3822a31f665..000000000000 --- a/mpi.patch +++ /dev/null @@ -1,26 +0,0 @@ ---- configure 2019-05-24 15:48:39.590316691 +0300 -+++ configure 2019-05-24 15:48:07.136424874 +0300 -@@ -970,15 +970,15 @@ - ####################### - $mpi_defines = "-DVMDMPI "; - ## Argonne MPICH --$mpi_dir = "/usr/lib64/mpich"; --$mpi_include = "-I/usr/include/mpich-x86_64"; --$mpi_library = "-L$mpi_dir/lib"; --$mpi_libs = "-lmpich"; -+#$mpi_dir = "/usr"; -+#$mpi_include = "-I/usr/include"; -+#$mpi_library = "-L$mpi_dir/lib"; -+#$mpi_libs = "-lmpicxx -lmpi"; - ## OpenMPI --# $mpi_dir = "/usr/lib64/openmpi"; --# $mpi_include = "-I/usr/include/openmpi-x86_64"; --# $mpi_library = "-L$mpi_dir/lib"; --# $mpi_libs = "-lmpi"; -+$mpi_dir = "/usr"; -+$mpi_include = "-I/usr/include"; -+$mpi_library = "-L$mpi_dir/lib/openmpi"; -+$mpi_libs = "-lmpi_cxx -lmpi"; - ## OSU MVAPICH - # $mpi_dir = "/usr/mpi/gcc/mvapich-1.0.0"; - # $mpi_include = "-I$mpi_dir/include"; diff --git a/ospray.patch b/ospray.patch deleted file mode 100644 index f825563d8f46..000000000000 --- a/ospray.patch +++ /dev/null @@ -1,53 +0,0 @@ ---- src/OSPRayRenderer.C 2016-12-01 10:11:51.000000000 +0300 -+++ src/OSPRayRenderer.C 2018-03-30 18:22:52.389958174 +0300 -@@ -11,7 +11,7 @@ - * - * $RCSfile: OSPRayRenderer.C - * $Author: johns $ $Locker: $ $State: Exp $ --* $Revision: 1.60 $ $Date: 2016/11/28 06:00:48 $ -+* $Revision: 1.61 $ $Date: 2017/01/10 13:57:50 $ - * - *************************************************************************** - * DESCRIPTION: -@@ -659,7 +659,7 @@ - if (ao_samples != 0) - lightscale = ao_direct; - -- for (int i = 0; i < directional_lights.num(); ++i) { -+ for (i = 0; i < directional_lights.num(); ++i) { - #if 1 - OSPLight light = ospNewLight(ospRenderer, "distant"); - #else -@@ -2021,7 +2021,7 @@ - ca.cylinders = (float *) calloc(1, cylnum * bytes_per_cylinder); - ca.colors = (float *) calloc(1, cylnum * 4 * sizeof(float)); - -- unsigned int i,ind4,ind6,ind7; -+ int i,ind4,ind6,ind7; - const int rOffset = 6; // radius offset - if (wtrans == NULL) { - for (i=0,ind4=0,ind6=0,ind7=0; i<cylnum; i++,ind4+=4,ind6+=6,ind7+=7) { -@@ -2469,10 +2469,7 @@ - int i, ind, ind9, ind12; - - const float ci2f = 1.0f / 255.0f; -- const float cn2f = 1.0f / 127.5f; - for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) { -- float norm[9]; -- - // transform to eye coordinates - wtrans.multpoint3d(v + ind9 , (float*) &mesh.v[ind9 ]); - wtrans.multpoint3d(v + ind9 + 3, (float*) &mesh.v[ind9 + 3]); -@@ -2627,12 +2624,6 @@ - // create and fill the OSPRay trimesh memory buffer - int i, ind, ind9, ind12; - -- const rgba c = { uniform_color[0], -- uniform_color[1], -- uniform_color[2], -- 1.0f /* mat_opacity*/ }; -- -- - for (i=0,ind=0,ind9=0,ind12=0; i<numfacets; i++,ind+=3,ind9+=9,ind12+=12) { - // transform to eye coordinates - wtrans.multpoint3d(v + ind9 , (float*) &mesh.v[ind9 ]); |