Autoformatted

This commit is contained in:
jpekkila
2020-06-24 15:56:30 +03:00
parent 3c3b2a1885
commit 0d1c5b3911
5 changed files with 108 additions and 68 deletions

View File

@@ -6,15 +6,21 @@
#include <mpi.h> #include <mpi.h>
#include <cuda_runtime_api.h>
#include <cuda.h> // CUDA driver API #include <cuda.h> // CUDA driver API
#include <cuda_runtime_api.h>
#include "timer_hires.h" // From src/common #include "timer_hires.h" // From src/common
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8) #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }} #define errchk(x) \
{ \
if (!(x)) { \
fprintf(stderr, "errchk(%s) failed", #x); \
assert(x); \
} \
}
/* /*
Findings: Findings:
@@ -62,7 +68,8 @@ allocDevicePinned(const size_t bytes)
uint8_t* arr = allocDevice(bytes); uint8_t* arr = allocDevice(bytes);
unsigned int flag = 1; unsigned int flag = 1;
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr); CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr)arr);
errchk(retval == CUDA_SUCCESS); errchk(retval == CUDA_SUCCESS);
return arr; return arr;
@@ -267,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice); cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
} }
static void static void
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst) sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
{ {
@@ -327,9 +333,9 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD);
} }
static void static void
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst, measurebw2(const char* msg, const size_t bytes,
void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
uint8_t* hsrc, uint8_t* ddst) uint8_t* hsrc, uint8_t* ddst)
{ {
const size_t num_samples = 100; const size_t num_samples = 100;
@@ -438,7 +444,8 @@ main(void)
// measurebw("Bidirectional bandwidth, async multiple parallel (Device)", // // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
// 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst); // 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", // measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst); 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
dst);
freeDevice(src); freeDevice(src);
freeDevice(dst); freeDevice(dst);
@@ -472,7 +479,8 @@ main(void)
measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst); measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst); measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst); measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
ddst);
freeDevice(dsrc); freeDevice(dsrc);
freeDevice(ddst); freeDevice(ddst);
@@ -490,7 +498,8 @@ main(void)
measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst); measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst); measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst); measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
hsrc, ddst);
freeDevice(dsrc); freeDevice(dsrc);
freeDevice(ddst); freeDevice(ddst);

View File

@@ -660,7 +660,8 @@ acCreatePackedData(const int3 dims)
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
unsigned int flag = 1; unsigned int flag = 1;
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned); CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr)data.data_pinned);
ERRCHK_ALWAYS(retval == CUDA_SUCCESS); ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
#else #else
ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));

View File

@@ -75,17 +75,20 @@ exp(const acComplex& val)
{ {
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y)); return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
} }
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b) static __device__ inline acComplex
operator*(const AcReal& a, const acComplex& b)
{ {
return (acComplex){a * b.x, a * b.y}; return (acComplex){a * b.x, a * b.y};
} }
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a) static __device__ inline acComplex
operator*(const acComplex& b, const AcReal& a)
{ {
return (acComplex){a * b.x, a * b.y}; return (acComplex){a * b.x, a * b.y};
} }
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b) static __device__ inline acComplex
operator*(const acComplex& a, const acComplex& b)
{ {
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x}; return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
} }
@@ -179,9 +182,9 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
} }
if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) { if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
fprintf( fprintf(stderr,
stderr, "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. "
"WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n", "Skipping.\n",
value.x, value.y, value.z, int3param_names[param]); value.x, value.y, value.z, int3param_names[param]);
return AC_FAILURE; return AC_FAILURE;
} }

View File

@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter( dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); src1[IDX(src_idx)],
src2[IDX(src_idx)]);
} }
template <ReduceFunc reduce> template <ReduceFunc reduce>

View File

@@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)}; const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0), const Scalar coefficients[] = {
(Scalar)(1.0 / 60.0)}; 0,
(Scalar)(3.0 / 4.0),
(Scalar)(-3.0 / 20.0),
(Scalar)(1.0 / 60.0),
};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), const Scalar coefficients[] = {
(Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)}; 0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
@@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
#if STENCIL_ORDER == 2 #if STENCIL_ORDER == 2
const Scalar coefficients[] = {-2, 1}; const Scalar coefficients[] = {-2, 1};
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0), const Scalar coefficients[] = {
(Scalar)(-1.0 / 12.0)}; (Scalar)(-5.0 / 2.0),
(Scalar)(4.0 / 3.0),
(Scalar)(-1.0 / 12.0),
};
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0), const Scalar coefficients[] = {
(Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)}; (Scalar)(-49.0 / 18.0),
(Scalar)(3.0 / 2.0),
(Scalar)(-3.0 / 20.0),
(Scalar)(1.0 / 90.0),
};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), const Scalar coefficients[] = {
(Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0), (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0),
(Scalar)(-1.0 / 560.0)}; (Scalar)(8.0 / 315.0), (Scalar)(-1.0 / 560.0),
};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
@@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)}; const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const Scalar coefficients[] = { const Scalar coefficients[] = {
0, (Scalar)(1.0 / 32.0), 0,
(Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders (Scalar)(1.0 / 32.0),
(Scalar)(1.0 / 64.0),
}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const Scalar fac = ((Scalar)(1. / 720.)); const Scalar fac = ((Scalar)(1. / 720.));
const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac, const Scalar coefficients[] = {
(Scalar)(2.0) * fac}; 0 * fac,
(Scalar)(270.0) * fac,
(Scalar)(-27.0) * fac,
(Scalar)(2.0) * fac,
};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const Scalar fac = ((Scalar)(1. / 20160.)); const Scalar fac = ((Scalar)(1. / 20160.));
const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac, const Scalar coefficients[] = {
(Scalar)(128.) * fac, (Scalar)(-9.) * fac}; 0 * fac,
(Scalar)(8064.) * fac,
(Scalar)(-1008.) * fac,
(Scalar)(128.) * fac,
(Scalar)(-9.) * fac,
};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
@@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
Scalar pencil_a[STENCIL_ORDER + 1]; Scalar pencil_a[STENCIL_ORDER + 1];
//#pragma unroll //#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2, pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
k)]; j + offset - STENCIL_ORDER / 2, k)];
Scalar pencil_b[STENCIL_ORDER + 1]; Scalar pencil_b[STENCIL_ORDER + 1];
//#pragma unroll //#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset, pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
k)]; j + STENCIL_ORDER / 2 - offset, k)];
return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy)); return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
} }
@@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec)
return (Vector){ return (Vector){
hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2], hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2], hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]}; hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
};
} }
// Takes uu gradients and returns S // Takes uu gradients and returns S
@@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
getInt(AC_ny) * getReal(AC_dsy), getInt(AC_ny) * getReal(AC_dsy),
getInt(AC_nz) * getReal(AC_dsz)}; // source (origin) getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
(void)a; // WARNING: not used (void)a; // WARNING: not used
Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx), Vector xx = (Vector){
(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
(globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy), (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
(globalVertexIdx.z - getInt(AC_nz_min)) * (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
getReal(AC_dsz)}; // sink (current index) }; // sink (current index)
const Scalar cs2 = getReal(AC_cs2_sound); const Scalar cs2 = getReal(AC_cs2_sound);
const Scalar cs = sqrt(cs2); const Scalar cs = sqrt(cs2);