path: root/cuda.patch
author     Anton Kudelin    2020-01-23 15:28:45 +0300
committer  Anton Kudelin    2020-01-23 15:28:45 +0300
commit     b76e49a197f9ca85917562e7126903efef435a8c (patch)
tree       c7ece2e247f0b878645e599aa0ed862ac39f5125 /cuda.patch
parent     b5bc00e19b30d6afada54c62ed505af6851c1551 (diff)
download   aur-b76e49a197f9ca85917562e7126903efef435a8c.tar.gz
Updated to the latest alpha
Diffstat (limited to 'cuda.patch')
-rw-r--r--    cuda.patch    426
1 file changed, 0 insertions, 426 deletions
diff --git a/cuda.patch b/cuda.patch
deleted file mode 100644
index 770360ec8d94..000000000000
--- a/cuda.patch
+++ /dev/null
@@ -1,426 +0,0 @@
---- src/CUDAMarchingCubes.cu 2018-03-30 18:52:25.467189457 +0300
-+++ src/CUDAMarchingCubes.cu 2018-03-30 18:52:02.387136244 +0300
-@@ -10,7 +10,7 @@
- *
- * $RCSfile: CUDAMarchingCubes.cu,v $
- * $Author: johns $ $Locker: $ $State: Exp $
-- * $Revision: 1.30 $ $Date: 2016/11/28 03:04:58 $
-+ * $Revision: 1.32 $ $Date: 2018/02/15 05:15:02 $
- *
- ***************************************************************************
- * DESCRIPTION:
-@@ -25,14 +25,17 @@
- //
- // Description: This class computes an isosurface for a given density grid
- // using a CUDA Marching Cubes (MC) algorithm.
--// The implementation is based on the MC demo from the
--// Nvidia GPU Computing SDK, but has been improved
--// and extended. This implementation achieves higher
--// performance by reducing the number of temporary memory
--// buffers, reduces the number of scan calls by using vector
--// integer types, and allows extraction of per-vertex normals
--// optionally computes per-vertex colors if provided with a
--// volumetric texture map.
-+//
-+// The implementation is loosely based on the MC demo from
-+// the Nvidia GPU Computing SDK, but the design has been
-+// improved and extended in several ways.
-+//
-+// This implementation achieves higher performance
-+// by reducing the number of temporary memory
-+// buffers, reduces the number of scan calls by using
-+// vector integer types, and allows extraction of
-+// per-vertex normals and optionally computes
-+// per-vertex colors if a volumetric texture map is provided.
- //
- // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de>
- // John Stone <johns@ks.uiuc.edu>
-@@ -48,7 +51,7 @@
- #include <thrust/functional.h>
-
- //
--// Restrict macro to make it easy to do perf tuning tess
-+// Restrict macro to make it easy to do perf tuning tests
- //
- #if 0
- #define RESTRICT __restrict__
-@@ -171,6 +174,11 @@
- texture<float, 3, cudaReadModeElementType> volumeTex;
-
- // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS
-+// XXX The sampleVolume() call underperforms vs. peak memory bandwidth
-+// because we don't strictly enforce coalescing requirements in the
-+// layout of the input volume presently. If we forced X/Y dims to be
-+// warp-multiple it would become possible to use wider fetches and
-+// a few other tricks to improve global memory bandwidth
- __device__ float sampleVolume(const float * RESTRICT data,
- uint3 p, uint3 gridSize) {
- return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
-@@ -592,6 +600,30 @@
- cudaBindTextureToArray(volumeTex, d_vol, desc);
- }
-
-+#if CUDART_VERSION >= 9000
-+//
-+// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when
-+// used with the built-in uint2 vector integer types. To workaround
-+// the problem we have to define our own type and associated conversion
-+// routines etc.
-+//
-+
-+// XXX workaround for uint2 breakage in CUDA 9.0RC
-+struct myuint2 : uint2 {
-+ __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {}
-+ __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {}
-+ __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {}
-+};
-+
-+void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
-+ const uint2 zero = make_uint2(0, 0);
-+ thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input),
-+ thrust::device_ptr<myuint2>((myuint2*)input + numElements),
-+ thrust::device_ptr<myuint2>((myuint2*)output),
-+ (myuint2) zero);
-+}
-+
-+#else
-
- void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
- const uint2 zero = make_uint2(0, 0);
-@@ -601,6 +633,7 @@
- zero);
- }
-
-+#endif
-
- void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) {
- thrust::inclusive_scan(thrust::device_ptr<float>(input),
-@@ -639,11 +672,9 @@
- }
-
-
--///////////////////////////////////////////////////////////////////////////////
- //
- // class CUDAMarchingCubes
- //
--///////////////////////////////////////////////////////////////////////////////
-
- CUDAMarchingCubes::CUDAMarchingCubes() {
- // initialize values
-@@ -713,9 +744,6 @@
- }
-
-
--////////////////////////////////////////////////////////////////////////////////
--//! Run the Cuda part of the computation
--////////////////////////////////////////////////////////////////////////////////
- void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) {
- // check if data is available
- if (!this->setdata)
-
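Note: the CUDAMarchingCubes.cu hunk above sidesteps the CUDA 9.0 Thrust/uint2 breakage by running the exclusive scan over a wrapper type derived from uint2. The following standalone sketch shows the same technique; it is illustrative only and not part of the patch, and the element-wise operator+ and test values are assumptions made here for the example.

#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

// Wrapper with the constructors Thrust expects; mirrors the patch's myuint2.
struct myuint2 : uint2 {
  __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {}
  __host__ __device__ myuint2(unsigned int v) : uint2(make_uint2(v, v)) {}
  __host__ __device__ myuint2(uint2 v) : uint2(make_uint2(v.x, v.y)) {}
};

// Element-wise addition used by the scan's default plus<> functor
// (assumed for this sketch; VMD defines its own uint2 arithmetic elsewhere).
__host__ __device__ inline myuint2 operator+(const myuint2 &a, const myuint2 &b) {
  return myuint2(make_uint2(a.x + b.x, a.y + b.y));
}

int main() {
  // Four (1,2) pairs; after an exclusive scan, element i holds the sum of elements 0..i-1.
  thrust::device_vector<myuint2> d(4, myuint2(make_uint2(1, 2)));
  thrust::exclusive_scan(d.begin(), d.end(), d.begin(), myuint2());
  myuint2 last = d[3];                       // expected: (3, 6)
  std::printf("last = (%u, %u)\n", last.x, last.y);
  return 0;
}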
---- src/CUDAMDFF.cu 2016-12-01 10:11:56.000000000 +0300
-+++ src/CUDAMDFF.cu 2018-03-30 18:56:44.352937599 +0300
-@@ -11,7 +11,7 @@
- *
- * $RCSfile: CUDAMDFF.cu,v $
- * $Author: johns $ $Locker: $ $State: Exp $
-- * $Revision: 1.75 $ $Date: 2015/04/07 20:41:26 $
-+ * $Revision: 1.78 $ $Date: 2018/02/19 07:10:37 $
- *
- ***************************************************************************
- * DESCRIPTION:
-@@ -28,12 +28,16 @@
- #include <stdlib.h>
- #include <string.h>
- #include <cuda.h>
--#include <float.h> // FLT_MAX etc
--
-+#if CUDART_VERSION >= 9000
-+#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0
-+#endif
- #if CUDART_VERSION < 4000
- #error The VMD MDFF feature requires CUDA 4.0 or later
- #endif
-
-+#include <float.h> // FLT_MAX etc
-+
-+
- #include "Inform.h"
- #include "utilities.h"
- #include "WKFThreads.h"
-@@ -588,6 +592,43 @@
- }
-
-
-+
-+// #define VMDUSESHUFFLE 1
-+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
-+// New warp shuffle-based CC sum reduction for Kepler and later GPUs.
-+inline __device__ void cc_sumreduction(int tid, int totaltb,
-+ float4 &total_cc_sums,
-+ float &total_lcc,
-+ int &total_lsize,
-+ float4 *tb_cc_sums,
-+ float *tb_lcc,
-+ int *tb_lsize) {
-+ total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-+ total_lcc = 0.0f;
-+ total_lsize = 0;
-+
-+ // use precisely one warp to do the final reduction
-+ if (tid < warpSize) {
-+ for (int i=tid; i<totaltb; i+=warpSize) {
-+ total_cc_sums += tb_cc_sums[i];
-+ total_lcc += tb_lcc[i];
-+ total_lsize += tb_lsize[i];
-+ }
-+
-+ // perform intra-warp parallel reduction...
-+ // general loop version of parallel sum-reduction
-+ for (int mask=warpSize/2; mask>0; mask>>=1) {
-+ total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask);
-+ total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask);
-+ total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask);
-+ total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask);
-+ total_lcc += __shfl_xor_sync(0xffffffff, total_lcc, mask);
-+ total_lsize += __shfl_xor_sync(0xffffffff, total_lsize, mask);
-+ }
-+ }
-+}
-+#else
-+// shared memory based CC sum reduction
- inline __device__ void cc_sumreduction(int tid, int totaltb,
- float4 &total_cc_sums,
- float &total_lcc,
-@@ -629,6 +670,7 @@
- total_lcc = tb_lcc[0];
- total_lsize = tb_lsize[0];
- }
-+#endif
-
-
- inline __device__ void thread_cc_sum(float ref, float density,
-@@ -750,6 +792,92 @@
- }
-
-
-+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
-+ // all threads write their local sums to shared memory...
-+ __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
-+ __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
-+ __shared__ float tb_lcc_s[TOTALBLOCKSZ];
-+ __shared__ int tb_lsize_s[TOTALBLOCKSZ];
-+
-+ tb_cc_means_s[tid] = thread_cc_means;
-+ tb_cc_squares_s[tid] = thread_cc_squares;
-+ tb_lcc_s[tid] = thread_lcc;
-+ tb_lsize_s[tid] = thread_lsize;
-+ __syncthreads(); // all threads must hit syncthreads call...
-+
-+ // use precisely one warp to do the thread-block-wide reduction
-+ if (tid < warpSize) {
-+ float2 tmp_cc_means = make_float2(0.0f, 0.0f);
-+ float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
-+ float tmp_lcc = 0.0f;
-+ int tmp_lsize = 0;
-+ for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
-+ tmp_cc_means += tb_cc_means_s[i];
-+ tmp_cc_squares += tb_cc_squares_s[i];
-+ tmp_lcc += tb_lcc_s[i];
-+ tmp_lsize += tb_lsize_s[i];
-+ }
-+
-+ // perform intra-warp parallel reduction...
-+ // general loop version of parallel sum-reduction
-+ for (int mask=warpSize/2; mask>0; mask>>=1) {
-+ tmp_cc_means.x += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask);
-+ tmp_cc_means.y += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask);
-+ tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask);
-+ tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask);
-+ tmp_lcc += __shfl_xor_sync(0xffffffff, tmp_lcc, mask);
-+ tmp_lsize += __shfl_xor_sync(0xffffffff, tmp_lsize, mask);
-+ }
-+
-+ // write per-thread-block partial sums to global memory,
-+ // if a per-thread-block CC output array is provided, write the
-+ // local CC for this thread block out, and finally, check if we
-+ // are the last thread block to finish, and finalize the overall
-+ // CC results for the entire grid of thread blocks.
-+ if (tid == 0) {
-+ unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
-+ blockIdx.y * gridDim.x + blockIdx.x;
-+
-+ tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y,
-+ tmp_cc_squares.x, tmp_cc_squares.y);
-+ tb_lcc[bid] = tmp_lcc;
-+ tb_lsize[bid] = tmp_lsize;
-+
-+ if (tb_CC != NULL) {
-+ float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
-+ tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
-+ tb_lsize_s[0], tb_lcc_s[0]);
-+
-+ // write local per-thread-block CC to global memory
-+ tb_CC[bid] = cc;
-+ }
-+
-+ __threadfence();
-+
-+ unsigned int value = atomicInc(&tbcatomic[0], totaltb);
-+ isLastBlockDone = (value == (totaltb - 1));
-+ }
-+ }
-+ __syncthreads();
-+
-+ if (isLastBlockDone) {
-+ float4 total_cc_sums;
-+ float total_lcc;
-+ int total_lsize;
-+ cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
-+ tb_cc_sums, tb_lcc, tb_lsize);
-+
-+ if (tid == 0) {
-+ tb_cc_sums[totaltb] = total_cc_sums;
-+ tb_lcc[totaltb] = total_lcc;
-+ tb_lsize[totaltb] = total_lsize;
-+ }
-+
-+ reset_atomic_counter(&tbcatomic[0]);
-+ }
-+
-+#else
-+
- // all threads write their local sums to shared memory...
- __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
- __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
-@@ -794,6 +922,7 @@
- }
- __syncthreads(); // all threads must hit syncthreads call...
- }
-+//#endif
-
- // write per-thread-block partial sums to global memory,
- // if a per-thread-block CC output array is provided, write the
-@@ -847,6 +976,7 @@
- }
- #endif
- }
-+#endif
- }
-
-
-
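Note: the VMDUSESHUFFLE path added in the CUDAMDFF.cu hunks above replaces shared-memory reductions with warp shuffles. The self-contained sketch below shows the underlying __shfl_xor_sync butterfly sum; it is illustrative only, not taken from the patch, and the kernel name and 32-element input are assumptions.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_sum(const float *in, float *out) {
  int lane = threadIdx.x & 31;            // lane index within the warp
  float v = in[threadIdx.x];
  // XOR butterfly: after log2(32) steps every lane holds the warp-wide sum.
  for (int mask = 16; mask > 0; mask >>= 1)
    v += __shfl_xor_sync(0xffffffff, v, mask);
  if (lane == 0)
    out[0] = v;
}

int main() {
  float h[32], *d_in, *d_out, result;
  for (int i = 0; i < 32; i++) h[i] = 1.0f;   // expected warp sum: 32
  cudaMalloc(&d_in, sizeof(h));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h, sizeof(h), cudaMemcpyHostToDevice);
  warp_sum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("warp sum = %f\n", result);
  cudaFree(d_in); cudaFree(d_out);
  return 0;
}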
---- src/CUDAQuickSurf.cu 2016-12-01 10:11:56.000000000 +0300
-+++ src/CUDAQuickSurf.cu 2018-03-30 19:01:38.777196233 +0300
-@@ -11,7 +11,7 @@
- *
- * $RCSfile: CUDAQuickSurf.cu,v $
- * $Author: johns $ $Locker: $ $State: Exp $
-- * $Revision: 1.81 $ $Date: 2016/04/20 04:57:46 $
-+ * $Revision: 1.84 $ $Date: 2018/02/15 04:59:15 $
- *
- ***************************************************************************
- * DESCRIPTION:
-@@ -22,6 +22,9 @@
- #include <stdlib.h>
- #include <string.h>
- #include <cuda.h>
-+#if CUDART_VERSION >= 9000
-+#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0
-+#endif
-
- #if CUDART_VERSION < 4000
- #error The VMD QuickSurf feature requires CUDA 4.0 or later
-@@ -130,14 +133,14 @@
- #define GUNROLL 1
- #endif
-
--#if __CUDA_ARCH__ >= 300
- #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
--#define MINBLOCKDENS 1
-+#if __CUDA_ARCH__ >= 600
-+#define MINBLOCKDENS 16
-+#elif __CUDA_ARCH__ >= 300
-+#define MINBLOCKDENS 16
- #elif __CUDA_ARCH__ >= 200
--#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
- #define MINBLOCKDENS 1
- #else
--#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
- #define MINBLOCKDENS 1
- #endif
-
-@@ -150,7 +153,7 @@
- //
- template<class DENSITY, class VOLTEX>
- __global__ static void
--// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
-+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
- gaussdensity_fast_tex_norm(int natoms,
- const float4 * RESTRICT sorted_xyzr,
- const float4 * RESTRICT sorted_color,
-@@ -217,6 +220,8 @@
- for (yab=yabmin; yab<=yabmax; yab++) {
- for (xab=xabmin; xab<=xabmax; xab++) {
- int abcellidx = zab * acplanesz + yab * acncells.x + xab;
-+ // this is the biggest latency hotspot in the kernel; if we could improve
-+ // packing of the grid cell map, we'd likely improve performance
- uint2 atomstartend = cellStartEnd[abcellidx];
- if (atomstartend.x != GRID_CELL_EMPTY) {
- unsigned int atomid;
-@@ -296,7 +301,7 @@
-
-
- __global__ static void
--// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
-+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
- gaussdensity_fast_tex3f(int natoms,
- const float4 * RESTRICT sorted_xyzr,
- const float4 * RESTRICT sorted_color,
-@@ -363,6 +368,8 @@
- for (yab=yabmin; yab<=yabmax; yab++) {
- for (xab=xabmin; xab<=xabmax; xab++) {
- int abcellidx = zab * acplanesz + yab * acncells.x + xab;
-+ // this is the biggest latency hotspot in the kernel; if we could improve
-+ // packing of the grid cell map, we'd likely improve performance
- uint2 atomstartend = cellStartEnd[abcellidx];
- if (atomstartend.x != GRID_CELL_EMPTY) {
- unsigned int atomid;
-@@ -550,7 +557,6 @@
-
- // per-GPU handle with various memory buffer pointers, etc.
- typedef struct {
-- /// max grid sizes and attributes the current allocations will support
- int verbose;
- long int natoms;
- int colorperatom;
-@@ -561,18 +567,18 @@
- int gy;
- int gz;
-
-- CUDAMarchingCubes *mc; ///< Marching cubes class used to extract surface
-+ CUDAMarchingCubes *mc;
-
-- float *devdensity; ///< density map stored in GPU memory
-- void *devvoltexmap; ///< volumetric texture map
-- float4 *xyzr_d; ///< atom coords and radii
-- float4 *sorted_xyzr_d; ///< cell-sorted coords and radii
-- float4 *color_d; ///< colors
-- float4 *sorted_color_d; ///< cell-sorted colors
--
-- unsigned int *atomIndex_d; ///< cell index for each atom
-- unsigned int *atomHash_d; ///<
-- uint2 *cellStartEnd_d; ///< cell start/end indices
-+ float *devdensity;
-+ void *devvoltexmap;
-+ float4 *xyzr_d;
-+ float4 *sorted_xyzr_d;
-+ float4 *color_d;
-+ float4 *sorted_color_d;
-+
-+ unsigned int *atomIndex_d;
-+ unsigned int *atomHash_d;
-+ uint2 *cellStartEnd_d;
-
- void *safety;
- float3 *v3f_d;
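Note: the CUDAQuickSurf.cu hunks above re-enable __launch_bounds__ on the density kernels and raise MINBLOCKDENS for newer architectures. The minimal sketch below shows what that attribute expresses to the compiler; it is illustrative only, and the kernel, macro names, and values are placeholders rather than the patch's.

#include <cuda_runtime.h>

#define MAXTHR 256   // upper bound on threads per block for this kernel
#define MINBLK 4     // ask the compiler to keep at least 4 blocks resident per SM

// __launch_bounds__(max threads per block, min blocks per SM) constrains
// register allocation so the requested occupancy remains achievable.
__global__ void __launch_bounds__(MAXTHR, MINBLK)
scale(float *data, float s, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= s;    // trivial body; the attribute is the point of the sketch
}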