diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu
index 2111b95..5ebf83f 100644
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -158,16 +158,16 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
 /*
 Here we decompose the host mesh and distribute it among the GPUs in the node.
-
+
 The host mesh is a huge contiguous block of data. Its dimensions are given by the global variable
 named "grid". A "grid" is decomposed into "subgrids", one for each GPU. Here we check which parts
 of the range s0...s1 maps to the memory space stored by some GPU, ranging d0...d1, and transfer
 the data if needed.
-
+
 The index mapping is inherently quite involved, but here's a picture which hopefully helps make
 sense out of all this.
-
+
 Grid
 |----num_vertices---|
@@ -181,8 +181,8 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
      ^   ^
      d0  d1
-     ^   ^
-     db  da
+     ^   ^
+     db  da
 */
 for (int i = 0; i < num_devices; ++i) {
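Note (illustration only): the picture above boils down to clamping the host range s0...s1 against each device's slice d0...d1 and copying the overlap db...da. A minimal 1D host-side sketch of that bookkeeping follows; the struct and helper names are invented for the example and are not code from this patch.

#include <algorithm>

// Hypothetical 1D version of the overlap computation sketched in the comment above.
struct CopyRange {
    int host_offset;   // first host vertex to copy (db)
    int device_offset; // where that vertex lands in the device's subgrid buffer
    int count;         // number of vertices to transfer (da - db), zero if no overlap
};

static CopyRange
overlap_with_device(const int s0, const int s1, const int device_id, const int subgrid_volume)
{
    const int d0 = device_id * subgrid_volume; // first vertex owned by this device
    const int d1 = d0 + subgrid_volume;        // one past the last vertex it owns
    const int db = std::max(s0, d0);           // start of the overlap
    const int da = std::min(s1, d1);           // end of the overlap
    return CopyRange{db, db - d0, std::max(0, da - db)};
}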
@@ -294,7 +294,7 @@ acBoundcondStep(void)
 /*
 // ===MIIKKANOTE START==========================================
-%JP: The old way for computing boundary conditions conflicts with the
+%JP: The old way for computing boundary conditions conflicts with the
 way we have to do things with multiple GPUs.
 
 The older approach relied on unified memory, which represented the whole
@@ -303,7 +303,7 @@ in its current state is more meant for quick prototyping when performance is not
 Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
 when managing the memory explicitly.
 
-In this new approach, I have simplified the multi- and single-GPU layers significantly.
+In this new approach, I have simplified the multi- and single-GPU layers significantly.
 Quick rundown:
        New struct: Grid. There are two global variables, "grid" and "subgrid", which
        contain the extents of the whole simulation domain and the decomposed grids,
        respectively.
@@ -314,7 +314,7 @@ Quick rundown:
        The whole simulation domain is decomposed with respect to the z dimension. For example,
        if the grid contains (nx, ny, nz) vertices, then the subgrids contain
        (nx, ny, nz / num_devices) vertices.
-
+
        An local index (i, j, k) in some subgrid can be mapped to the global grid with
                global idx = (i, j, k + device_id * subgrid.n.z)
 
@@ -328,35 +328,35 @@ Changes required to this commented code block:
          laid out in device memory
        - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
          handle of type "Device" which should be passed to single-GPU functions. In this file, all devices
-         are stored in a global array "devices[num_devices]".
+         are stored in a global array "devices[num_devices]".
        - Every single-GPU function is executed asynchronously by default such that we
          can optimize Astaroth by executing memory transactions concurrently with
          computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
          Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
-         as a parameter and commands executing in different streams can be processed
+         as a parameter and commands executing in different streams can be processed
          in parallel/concurrently.
 
 Note on periodic boundaries (might be helpful when implementing other boundary conditions):
-       With multiple GPUs, periodic boundary conditions applied on indices ranging from
-
+       With multiple GPUs, periodic boundary conditions applied on indices ranging from
+
        (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
        on a single device are "local", in the sense that they can be computed without having
        to exchange data with neighboring GPUs. Special care is needed only for transferring
        the data to the fron and back plates outside this range. In the solution we use here,
        we solve the local boundaries first, and then just exchange the front and back plates
        in a "ring", like so
                device_id
            (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
-
+
 // ======MIIKKANOTE END==========================================
 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
                                               moved into device.cu, function boundCondStep()
-                                              In astaroth.cu, we use acBoundcondStep()
+                                              In astaroth.cu, we use acBoundcondStep()
                                               just to distribute the work and manage
                                               communication between GPUs.
@@ -364,8 +364,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
     exit(0);
 #else
-
-
+
+
     const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
 
     const int3 start = (int3){0, 0, device_id * depth};
@@ -378,8 +378,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
     // TODO uses the default stream currently
     if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
         wedge_boundconds(0, tpb, start, end, d_buffer);
-    } else {
-        for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+    } else {
+        for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
             periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 */
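Note (illustration only): the "ring" above means each device only ever exchanges plates with its two neighbours, with wrap-around at the ends. A schematic of that neighbour bookkeeping is sketched below; it is not the patch's implementation, and the plate pointers, sizes and streams are placeholders.

#include <cuda_runtime.h>

// One direction of the ring exchange: device i pushes its back plate to device i+1
// (the opposite direction is symmetric). plates[] and plate_bytes are placeholders
// standing in for the STENCIL_ORDER/2-deep halo slabs of each subgrid.
static void
exchange_plates_ring(AcReal* plates[], const size_t plate_bytes, const int num_devices,
                     const cudaStream_t streams[])
{
    for (int device_id = 0; device_id < num_devices; ++device_id) {
        const int next = (device_id + 1) % num_devices; // wraps n -> 0, closing the ring
        cudaMemcpyPeerAsync(plates[next], next, plates[device_id], device_id,
                            plate_bytes, streams[device_id]);
    }
}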
@@ -424,20 +424,52 @@ acIntegrate(const AcReal& dt)
     return AC_SUCCESS;
 }
 
+static AcReal
+simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
+{
+    AcReal res = results[0];
+    for (int i = 1; i < n; ++i) {
+        if (rtype == RTYPE_MAX) {
+            res = max(res, results[i]);
+        } else if (rtype == RTYPE_MIN) {
+            res = min(res, results[i]);
+        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            res = sum(res, results[i]);
+        } else {
+            ERROR("Invalid rtype");
+        }
+    }
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
+        res = sqrt(inv_n * res);
+    }
+
+    return res;
+}
+
 AcReal
 acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }
 
 AcReal
 acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
             const VertexBufferHandle& c)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }
 
 AcResult
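Note: with these definitions a reduction over the whole multi-GPU mesh is a per-device partial reduction followed by the tiny host-side combine above: max/min of the partial maxima/minima, or a sum of the partial sums of (exp-)squares that is then normalized by the global grid volume and square-rooted. A hedged usage sketch is below; the VTXBUF_* handle names are illustrative stand-ins for the project's generated handles, not guaranteed to exist under these exact names.

#include <cstdio>

// Compute a couple of typical diagnostics across all GPUs in the node.
static void
print_diagnostics(void)
{
    const AcReal rho_max = acReduceScal(RTYPE_MAX, VTXBUF_LNRHO);                      // scalar field
    const AcReal uu_rms  = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); // vector field
    printf("max(lnrho) = %g, rms(uu) = %g\n", (double)rho_max, (double)uu_rms);
}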
diff --git a/src/core/device.cu b/src/core/device.cu
index 52adc6b..51f6db9 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -37,6 +37,7 @@ __constant__ AcMeshInfo d_mesh_info;
 #define DCONST_INT(X) (d_mesh_info.int_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
 #include "kernels/kernels.cuh"
 
 struct device_s {
@@ -118,7 +119,7 @@ AcResult
 createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
     cudaSetDevice(id);
-    cudaDeviceReset();
+    cudaDeviceReset();
 
     // Create Device
     struct device_s* device = (struct device_s*) malloc(sizeof(*device));
@@ -194,16 +195,40 @@ boundcondStep(const Device device, const StreamType stream_type, const int3& sta
 }
 
 AcResult
-reduceScal(const Device device)
+reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+           const VertexBufferHandle vtxbuf_handle, AcReal* result)
 {
     cudaSetDevice(device->id);
+
+    *result = reduce_scal(device->streams[stream_type], rtype,
+                          device->local_config.int_params[AC_nx],
+                          device->local_config.int_params[AC_ny],
+                          device->local_config.int_params[AC_nz],
+                          device->vba.in[vtxbuf_handle],
+                          device->reduce_scratchpad, device->reduce_result);
+
     return AC_SUCCESS;
 }
 
 AcResult
-reduceVec(const Device device)
+reduceVec(const Device device, const StreamType stream_type,
+          const ReductionType rtype,
+          const VertexBufferHandle vec0,
+          const VertexBufferHandle vec1,
+          const VertexBufferHandle vec2,
+          AcReal* result)
 {
     cudaSetDevice(device->id);
+
+    *result = reduce_vec(device->streams[stream_type], rtype,
+                         device->local_config.int_params[AC_nx],
+                         device->local_config.int_params[AC_ny],
+                         device->local_config.int_params[AC_nz],
+                         device->vba.in[vec0],
+                         device->vba.in[vec1],
+                         device->vba.in[vec2],
+                         device->reduce_scratchpad, device->reduce_result);
+
     return AC_SUCCESS;
 }
diff --git a/src/core/device.cuh b/src/core/device.cuh
index 41db226..56070c0 100644
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -51,10 +51,16 @@ AcResult boundcondStep(const Device device, const StreamType stream_type,
                        const int3& start, const int3& end);
 
 /** */
-AcResult reduceScal(const Device device);
+AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+                    const VertexBufferHandle vtxbuf_handle, AcReal* result);
 
 /** */
-AcResult reduceVec(const Device device);
+AcResult reduceVec(const Device device, const StreamType stream_type,
+                   const ReductionType rtype,
+                   const VertexBufferHandle vec0,
+                   const VertexBufferHandle vec1,
+                   const VertexBufferHandle vec2,
+                   AcReal* result);
 
 /** */
 AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
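Note: DEVICE_VTXBUF_IDX addresses the padded mx*my*mz vertex buffer (computational domain plus ghost zones), whereas the new DEVICE_1D_COMPDOMAIN_IDX addresses the tightly packed nx*ny*nz reduction scratchpad. A host-side replica with made-up dimensions (NGHOST standing in for STENCIL_ORDER/2, 128 for AC_nx) makes the difference concrete; it is illustration only, not code from the patch.

// Host-side replica of the two index macros, for illustration only.
constexpr int NGHOST = 3;                  // assumed ghost-zone depth (STENCIL_ORDER/2)
constexpr int NX = 128, MX = NX + 2 * NGHOST;

constexpr int vtxbuf_idx(int i, int j, int k)     { return i + j * MX + k * MX * MX; } // padded mesh
constexpr int compdomain_idx(int i, int j, int k) { return i + j * NX + k * NX * NX; } // packed scratchpad

// The first interior point (0,0,0) of the computational domain sits at
// vtxbuf_idx(NGHOST, NGHOST, NGHOST) in the vertex buffer but at index 0 in the scratchpad.
static_assert(compdomain_idx(0, 0, 0) == 0, "scratchpad is tightly packed");
static_assert(vtxbuf_idx(NGHOST, NGHOST, NGHOST) == NGHOST * (1 + MX + MX * MX),
              "padded mesh includes the ghost zones");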
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh
index cc05cf4..f930bc8 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -754,7 +754,7 @@ randf(void)
 }
 
 AcResult
-rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
+rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
                const AcReal dt, VertexBufferArray* buffer)
 {
     const dim3 tpb(32, 1, 4);
@@ -792,3 +792,294 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
     ERRCHK_CUDA_KERNEL();
     return AC_SUCCESS;
 }
+
+
+////////////////REDUCE///////////////////////////
+#include "src/core/math_utils.h" // is_power_of_two
+
+// Function pointer definitions
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
+                                       const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+__device__ inline AcReal
+_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+__device__ inline AcReal
+_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+__device__ inline AcReal
+_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Function used to determine the values used during reduction */
+__device__ inline AcReal
+_device_length_scal(const AcReal& a) { return AcReal(a); }
+
+__device__ inline AcReal
+_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
+
+__device__ inline AcReal
+_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
+
+__device__ inline AcReal
+_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+__device__ inline AcReal
+_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
+
+__device__ inline AcReal
+_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
+// clang-format on
+
+__device__ inline bool
+oob(const int& i, const int& j, const int& k)
+{
+    if (i >= d_mesh_info.int_params[AC_nx] ||
+        j >= d_mesh_info.int_params[AC_ny] ||
+        k >= d_mesh_info.int_params[AC_nz])
+        return true;
+    else
+        return false;
+}
+
+template <ReduceInitialScalFunc reduce_initial>
+__global__ void
+_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(
+        i + d_mesh_info.int_params[AC_nx_min],
+        j + d_mesh_info.int_params[AC_ny_min],
+        k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src[src_idx]);
+}
+
+template <ReduceInitialVecFunc reduce_initial>
+__global__ void
+_kernel_reduce_vec(const __restrict__ AcReal* src_a,
+                   const __restrict__ AcReal* src_b,
+                   const __restrict__ AcReal* src_c, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(
+        i + d_mesh_info.int_params[AC_nx_min],
+        j + d_mesh_info.int_params[AC_ny_min],
+        k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
+                                  src_c[src_idx]);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define BLOCK_SIZE (1024)
+#define ELEMS_PER_THREAD (32)
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce(AcReal* src, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
+    if (idx >= scratchpad_size)
+        return;
+
+    __shared__ AcReal smem[BLOCK_SIZE];
+
+    AcReal tmp = src[idx];
+
+    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
+        const int src_idx = idx + i * BLOCK_SIZE;
+        if (src_idx >= scratchpad_size) {
+            // This check is for safety: if accessing uninitialized values
+            // beyond the mesh boundaries, we will immediately start seeing NANs
+            if (threadIdx.x < BLOCK_SIZE)
+                smem[threadIdx.x] = NAN;
+            else
+                break;
+        }
+        tmp = reduce(tmp, src[src_idx]);
+    }
+
+    smem[threadIdx.x] = tmp;
+    __syncthreads();
+
+    int offset = BLOCK_SIZE / 2;
+    while (offset > 0) {
+
+        if (threadIdx.x < offset) {
+            tmp = reduce(tmp, smem[threadIdx.x + offset]);
+            smem[threadIdx.x] = tmp;
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        src[idx] = tmp;
+}
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
+{
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    AcReal tmp = src[idx];
+    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
+    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
+        tmp = reduce(tmp, src[idx + i * block_offset]);
+
+    *result = tmp;
+}
+//////////////////////////////////////////////////////////////////////////////
+
+AcReal
+reduce_scal(const cudaStream_t stream,
+            const ReductionType& rtype, const int& nx, const int& ny,
+            const int& nz, const AcReal* vertex_buffer,
+            AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
+                   int(ceil(AcReal(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
+                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_scal<_device_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_scal<_device_exp_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+reduce_vec(const cudaStream_t stream,
+           const ReductionType& rtype, const int& nx, const int& ny,
+           const int& nz, const AcReal* vertex_buffer_a,
+           const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+           AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
+                   int(ceil(float(ny) / tpb.y)),
+                   int(ceil(float(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
+                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    // "Features" of this quick & efficient reduction:
+    //   Block size must be smaller than the computational domain size
+    //   (otherwise we would have do some additional bounds checking in the
+    //   second half of _kernel_reduce, which gets quite confusing)
+    //   Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
+    //   the work without worrying too much about the array bounds.
+    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
+    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
+    // NOTE! Also does not work properly with non-power of two mesh dimension
+    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
+    // see that uninitialized smem values are used in the comparison
+    ERRCHK_ALWAYS(is_power_of_two(nx));
+    ERRCHK_ALWAYS(is_power_of_two(ny));
+    ERRCHK_ALWAYS(is_power_of_two(nz));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_vec<_device_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_vec<_device_exp_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
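Note: taken together the three kernels above form a two-pass reduction: _kernel_reduce_scal / _kernel_reduce_vec map every computational-domain point into the packed scratchpad, _kernel_reduce folds ELEMS_PER_THREAD strided elements per thread and then runs a shared-memory tree per block, and _kernel_reduce_block collapses the surviving per-block partials into the single value that reduce_scal / reduce_vec copy back. A host-side reference of the same per-device result is sketched below; it is not part of the patch, and it assumes AcReal and the RTYPE_* enum from the public header plus a field already packed into an nx*ny*nz array.

#include <cmath>

// Per-device reference result. For RTYPE_RMS / RTYPE_RMS_EXP this is only the sum of
// (exp-)squares; dividing by the global vertex count and taking the square root happens
// later on the host, in simple_final_reduce_scal.
static AcReal
reference_reduce_scal(const ReductionType rtype, const AcReal* field, const int n)
{
    AcReal res = (rtype == RTYPE_MAX || rtype == RTYPE_MIN) ? field[0] : AcReal(0);
    for (int i = 0; i < n; ++i) {
        const AcReal a = field[i];
        if (rtype == RTYPE_MAX)
            res = a > res ? a : res;
        else if (rtype == RTYPE_MIN)
            res = a < res ? a : res;
        else if (rtype == RTYPE_RMS)
            res += a * a;                     // matches _device_squared_scal + _device_sum
        else if (rtype == RTYPE_RMS_EXP)
            res += std::exp(a) * std::exp(a); // matches _device_exp_squared_scal + _device_sum
    }
    return res;
}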
diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc
index 52a2219..bdbd9e3 100644
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -478,7 +478,7 @@ run_autotest(void)
     //const vec3i test_dims[] = {{32, 32, 32}};
 
     int num_failures = 0;
-    /*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
         config.int_params[AC_nx] = test_dims[i].x;
         config.int_params[AC_ny] = test_dims[i].y;
         config.int_params[AC_nz] = test_dims[i].z;
@@ -489,7 +489,7 @@ run_autotest(void)
         num_failures += check_reductions(config);
         fflush(stdout);
-    }*/ // TODO uncomment
+    }
 
     for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
         config.int_params[AC_nx] = test_dims[i].x;
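Note: the loop re-enabled above exercises exactly the new reduction path for each test mesh size. The shape of such a check is roughly the sketch below; the tolerance and the host-side model array are placeholders, and check_reductions in autotest.cc is the real implementation.

#include <cmath>

// Illustrative only: compare the multi-GPU result against a host-side model of one
// vertex buffer's computational domain (n = nx*ny*nz values).
static int
compare_max_reduction(const AcReal* host_model_field, const int n,
                      const VertexBufferHandle handle, const AcReal tolerance)
{
    AcReal model = host_model_field[0];
    for (int i = 1; i < n; ++i)
        model = host_model_field[i] > model ? host_model_field[i] : model;

    const AcReal candidate = acReduceScal(RTYPE_MAX, handle); // GPU path added in this patch
    return std::fabs((double)(candidate - model)) <= (double)tolerance ? 0 : 1; // num failures
}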