From 0d1c5b3911f502c3fe8a7d91b344a81921a2d822 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 24 Jun 2020 15:56:30 +0300 Subject: [PATCH] Autoformatted --- samples/bwtest/main.c | 47 ++++++++++++-------- src/core/device.cc | 21 ++++----- src/core/kernels/kernels.cu | 25 ++++++----- src/core/kernels/reductions.cuh | 5 ++- src/utils/modelsolver.c | 78 ++++++++++++++++++++++----------- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c index 9dd25d9..35d98d5 100644 --- a/samples/bwtest/main.c +++ b/samples/bwtest/main.c @@ -6,15 +6,21 @@ #include -#include #include // CUDA driver API +#include #include "timer_hires.h" // From src/common //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8) -#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }} +#define errchk(x) \ + { \ + if (!(x)) { \ + fprintf(stderr, "errchk(%s) failed", #x); \ + assert(x); \ + } \ + } /* Findings: @@ -57,17 +63,18 @@ allocDevice(const size_t bytes) static uint8_t* allocDevicePinned(const size_t bytes) { - #define USE_CUDA_DRIVER_PINNING (1) - #if USE_CUDA_DRIVER_PINNING +#define USE_CUDA_DRIVER_PINNING (1) +#if USE_CUDA_DRIVER_PINNING uint8_t* arr = allocDevice(bytes); unsigned int flag = 1; - CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr); + CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)arr); errchk(retval == CUDA_SUCCESS); return arr; - #else +#else uint8_t* arr; // Standard (20 GiB/s internode, 85 GiB/s intranode) // const cudaError_t retval = cudaMalloc((void**)&arr, bytes); @@ -77,7 +84,7 @@ allocDevicePinned(const size_t bytes) const cudaError_t retval = cudaMallocHost((void**)&arr, bytes); errchk(retval == cudaSuccess); return arr; - #endif +#endif } /* @@ -267,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst) cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice); } - static void sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst) { @@ -327,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_ MPI_Barrier(MPI_COMM_WORLD); } - static void -measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst, - uint8_t* hsrc, uint8_t* ddst) +measurebw2(const char* msg, const size_t bytes, + void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst, + uint8_t* hsrc, uint8_t* ddst) { const size_t num_samples = 100; @@ -414,8 +420,8 @@ main(void) measurebw("Bidirectional bandwidth, twoway (Host)", // 2 * BLOCK_SIZE, sendrecv_twoway, src, dst); measurebw("Bidirectional bandwidth, async multiple (Host)", // - 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); - //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", // + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); + // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", // // 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst); freeHost(src); @@ -434,11 +440,12 @@ main(void) measurebw("Bidirectional bandwidth, twoway (Device)", // 2 * BLOCK_SIZE, sendrecv_twoway, src, dst); measurebw("Bidirectional bandwidth, async multiple (Device)", // - 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); - //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", // + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); + // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", // // 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst); measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", // - 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst); + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, + dst); freeDevice(src); freeDevice(dst); @@ -456,7 +463,7 @@ main(void) measurebw("Bidirectional bandwidth, twoway (Device, pinned)", // 2 * BLOCK_SIZE, sendrecv_twoway, src, dst); measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", // - 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); freeDevice(src); freeDevice(dst); @@ -472,7 +479,8 @@ main(void) measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst); measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst); - measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst); + measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, + ddst); freeDevice(dsrc); freeDevice(ddst); @@ -490,7 +498,8 @@ main(void) measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst); measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst); - measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst); + measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, + hsrc, ddst); freeDevice(dsrc); freeDevice(ddst); diff --git a/src/core/device.cc b/src/core/device.cc index 181e802..8cda677 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -16,7 +16,7 @@ #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) #define MPI_INCL_CORNERS (0) -#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory +#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost #include // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set) @@ -656,17 +656,18 @@ acCreatePackedData(const int3 dims) const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); - #if MPI_USE_CUDA_DRIVER_PINNING - ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); +#if MPI_USE_CUDA_DRIVER_PINNING + ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); - unsigned int flag = 1; - CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned); - ERRCHK_ALWAYS(retval == CUDA_SUCCESS); - #else + unsigned int flag = 1; + CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)data.data_pinned); + ERRCHK_ALWAYS(retval == CUDA_SUCCESS); +#else ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); - // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly - // slower than pinned (38 ms vs. 125 ms) - #endif // USE_CUDA_DRIVER_PINNING +// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly +// slower than pinned (38 ms vs. 125 ms) +#endif // USE_CUDA_DRIVER_PINNING return data; } diff --git a/src/core/kernels/kernels.cu b/src/core/kernels/kernels.cu index 3c59486..2878f38 100644 --- a/src/core/kernels/kernels.cu +++ b/src/core/kernels/kernels.cu @@ -75,17 +75,20 @@ exp(const acComplex& val) { return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y)); } -static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b) +static __device__ inline acComplex +operator*(const AcReal& a, const acComplex& b) { return (acComplex){a * b.x, a * b.y}; } -static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a) +static __device__ inline acComplex +operator*(const acComplex& b, const AcReal& a) { return (acComplex){a * b.x, a * b.y}; } -static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b) +static __device__ inline acComplex +operator*(const acComplex& a, const acComplex& b) { return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x}; } @@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, device->streams[stream])); + cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } @@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info; ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, device->streams[stream])); + cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } @@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, device->streams[stream])); + cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } @@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa } if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) { - fprintf( - stderr, - "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n", - value.x, value.y, value.z, int3param_names[param]); + fprintf(stderr, + "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. " + "Skipping.\n", + value.x, value.y, value.z, int3param_names[param]); return AC_FAILURE; } @@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device) { cudaSetDevice(device->id); - // clang-format off +// clang-format off // Scalar #define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE); AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM) diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh index 8877e7e..1f40df3 100644 --- a/src/core/kernels/reductions.cuh +++ b/src/core/kernels/reductions.cuh @@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); - dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter( - src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); + dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], + src1[IDX(src_idx)], + src2[IDX(src_idx)]); } template diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c index d9446eb..4c2edcf 100644 --- a/src/utils/modelsolver.c +++ b/src/utils/modelsolver.c @@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds) #elif STENCIL_ORDER == 4 const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)}; #elif STENCIL_ORDER == 6 - const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0), - (Scalar)(1.0 / 60.0)}; + const Scalar coefficients[] = { + 0, + (Scalar)(3.0 / 4.0), + (Scalar)(-3.0 / 20.0), + (Scalar)(1.0 / 60.0), + }; #elif STENCIL_ORDER == 8 - const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), - (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)}; + const Scalar coefficients[] = { + 0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0), + }; #endif #define MID (STENCIL_ORDER / 2) @@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds) #if STENCIL_ORDER == 2 const Scalar coefficients[] = {-2, 1}; #elif STENCIL_ORDER == 4 - const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0), - (Scalar)(-1.0 / 12.0)}; + const Scalar coefficients[] = { + (Scalar)(-5.0 / 2.0), + (Scalar)(4.0 / 3.0), + (Scalar)(-1.0 / 12.0), + }; #elif STENCIL_ORDER == 6 - const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0), - (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)}; + const Scalar coefficients[] = { + (Scalar)(-49.0 / 18.0), + (Scalar)(3.0 / 2.0), + (Scalar)(-3.0 / 20.0), + (Scalar)(1.0 / 90.0), + }; #elif STENCIL_ORDER == 8 - const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), - (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0), - (Scalar)(-1.0 / 560.0)}; + const Scalar coefficients[] = { + (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0), + (Scalar)(8.0 / 315.0), (Scalar)(-1.0 / 560.0), + }; #endif #define MID (STENCIL_ORDER / 2) @@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)}; #elif STENCIL_ORDER == 4 const Scalar coefficients[] = { - 0, (Scalar)(1.0 / 32.0), - (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders + 0, + (Scalar)(1.0 / 32.0), + (Scalar)(1.0 / 64.0), + }; // TODO correct coefficients, these are just placeholders #elif STENCIL_ORDER == 6 const Scalar fac = ((Scalar)(1. / 720.)); - const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac, - (Scalar)(2.0) * fac}; + const Scalar coefficients[] = { + 0 * fac, + (Scalar)(270.0) * fac, + (Scalar)(-27.0) * fac, + (Scalar)(2.0) * fac, + }; #elif STENCIL_ORDER == 8 const Scalar fac = ((Scalar)(1. / 20160.)); - const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac, - (Scalar)(128.) * fac, (Scalar)(-9.) * fac}; + const Scalar coefficients[] = { + 0 * fac, + (Scalar)(8064.) * fac, + (Scalar)(-1008.) * fac, + (Scalar)(128.) * fac, + (Scalar)(-9.) * fac, + }; #endif #define MID (STENCIL_ORDER / 2) @@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr) Scalar pencil_a[STENCIL_ORDER + 1]; //#pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2, - k)]; + pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, // + j + offset - STENCIL_ORDER / 2, k)]; Scalar pencil_b[STENCIL_ORDER + 1]; //#pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset, - k)]; + pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, // + j + STENCIL_ORDER / 2 - offset, k)]; return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy)); } @@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec) return (Vector){ hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2], hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2], - hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]}; + hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2], + }; } // Takes uu gradients and returns S @@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt) getInt(AC_ny) * getReal(AC_dsy), getInt(AC_nz) * getReal(AC_dsz)}; // source (origin) (void)a; // WARNING: not used - Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx), - (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy), - (globalVertexIdx.z - getInt(AC_nz_min)) * - getReal(AC_dsz)}; // sink (current index) + Vector xx = (Vector){ + (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx), + (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy), + (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz), + }; // sink (current index) const Scalar cs2 = getReal(AC_cs2_sound); const Scalar cs = sqrt(cs2);