diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c index ada1721..73f4387 100644 --- a/samples/bwtest/main.c +++ b/samples/bwtest/main.c @@ -13,6 +13,8 @@ //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8) +#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }} + /* Findings: - MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly @@ -27,7 +29,7 @@ static uint8_t* allocHost(const size_t bytes) { uint8_t* arr = malloc(bytes); - assert(arr); + errchk(arr); return arr; } @@ -47,7 +49,7 @@ allocDevice(const size_t bytes) // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal); // Pinned (40 GiB/s internode, 10 GiB/s intranode) // const cudaError_t retval = cudaMallocHost((void**)&arr, bytes); - assert(retval == cudaSuccess); + errchk(retval == cudaSuccess); return arr; } @@ -61,7 +63,7 @@ allocDevicePinned(const size_t bytes) // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal); // Pinned (40 GiB/s internode, 10 GiB/s intranode) const cudaError_t retval = cudaMallocHost((void**)&arr, bytes); - assert(retval == cudaSuccess); + errchk(retval == cudaSuccess); return arr; } @@ -147,6 +149,7 @@ sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst) } } +/* static void sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst) { @@ -154,7 +157,7 @@ sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst) MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Request recv_requests[nprocs], send_requests[nprocs]; + MPI_Request send_requests[nprocs]; for (int i = 1; i < nprocs; ++i) { int nfront = (pid + i) % nprocs; MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]); @@ -180,6 +183,7 @@ sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst) MPI_Wait(&send_requests[i], &status); } } +*/ static void sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst) @@ -198,8 +202,6 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst) int devices_per_node = -1; cudaGetDeviceCount(&devices_per_node); - const int node_id = pid / devices_per_node; - MPI_Request recv_requests[nprocs], send_requests[nprocs]; for (int i = 1; i < nprocs; ++i) { int nfront = (pid + i) % nprocs; @@ -226,20 +228,20 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst) } static void -send_d2h(const uint8_t* src, uint8_t* dst) +send_d2h(uint8_t* src, uint8_t* dst) { cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost); } static void -send_h2d(const uint8_t* src, uint8_t* dst) +send_h2d(uint8_t* src, uint8_t* dst) { cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice); } static void -sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst) +sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst) { cudaStream_t d2h, h2d; cudaStreamCreate(&d2h); @@ -299,8 +301,8 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_ static void -measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst, - const uint8_t* hsrc, uint8_t* ddst) +measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst, + uint8_t* hsrc, uint8_t* ddst) { const size_t num_samples = 100; @@ -342,7 +344,7 @@ main(void) MPI_Init(NULL, NULL); // int provided; // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); - // assert(provided >= MPI_THREAD_MULTIPLE); + // errchk(provided >= MPI_THREAD_MULTIPLE); // Disable stdout buffering setbuf(stdout, NULL); @@ -350,7 +352,7 @@ main(void) int pid, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - assert(nprocs >= 2); // Require at least one neighbor + errchk(nprocs >= 2); // Require at least one neighbor MPI_Barrier(MPI_COMM_WORLD); if (!pid) { @@ -432,7 +434,7 @@ main(void) freeDevice(dst); } PRINT("\n------------------------\n"); - + { uint8_t* hsrc = allocHost(BLOCK_SIZE); uint8_t* hdst = allocHost(BLOCK_SIZE); @@ -450,7 +452,7 @@ main(void) freeHost(hdst); } PRINT("\n------------------------\n"); - + { uint8_t* hsrc = allocHost(BLOCK_SIZE); uint8_t* hdst = allocHost(BLOCK_SIZE); diff --git a/src/core/device.cc b/src/core/device.cc index 949fd69..2b1e482 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -962,7 +962,7 @@ acSyncCommData(const CommData data) static int3 mod(const int3 a, const int3 n) { - return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)}; + return (int3){(int)mod(a.x, n.x), (int)mod(a.y, n.y), (int)mod(a.z, n.z)}; } static void @@ -1058,7 +1058,6 @@ acTransferCommData(const Device device, // const int3 dims = data->dims; const size_t blockcount = data->count; const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - const int3 nghost = (int3){NGHOST, NGHOST, NGHOST}; for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) { @@ -1286,7 +1285,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) const Device device = grid.device; const int3 nn = grid.nn; - CommData corner_data = grid.corner_data; + //CommData corner_data = grid.corner_data; // Do not rm: required for corners CommData edgex_data = grid.edgex_data; CommData edgey_data = grid.edgey_data; CommData edgez_data = grid.edgez_data; @@ -1297,6 +1296,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) acDeviceSynchronizeStream(device, stream); // Corners + /* + // Do not rm: required for corners const int3 corner_b0s[] = { (int3){0, 0, 0}, (int3){NGHOST + nn.x, 0, 0}, @@ -1308,6 +1309,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) (int3){0, NGHOST + nn.y, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, }; + */ // Edges X const int3 edgex_b0s[] = { @@ -1355,7 +1357,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; for (int isubstep = 0; isubstep < 3; ++isubstep) { - // acPackCommData(device, corner_b0s, &corner_data); + // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners acPackCommData(device, edgex_b0s, &edgex_data); acPackCommData(device, edgey_b0s, &edgey_data); acPackCommData(device, edgez_b0s, &edgez_data); @@ -1363,18 +1365,10 @@ acGridIntegrate(const Stream stream, const AcReal dt) acPackCommData(device, sidexz_b0s, &sidexz_data); acPackCommData(device, sideyz_b0s, &sideyz_data); - //////////// INNER INTEGRATION ////////////// - { - const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = nn; - acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); - } - //////////////////////////////////////////// - MPI_Barrier(MPI_COMM_WORLD); #if MPI_GPUDIRECT_DISABLED - // acTransferCommDataToHost(device, &corner_data); + // acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners acTransferCommDataToHost(device, &edgex_data); acTransferCommDataToHost(device, &edgey_data); acTransferCommDataToHost(device, &edgez_data); @@ -1383,7 +1377,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommDataToHost(device, &sideyz_data); #endif - // acTransferCommData(device, corner_b0s, &corner_data); + // acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners acTransferCommData(device, edgex_b0s, &edgex_data); acTransferCommData(device, edgey_b0s, &edgey_data); acTransferCommData(device, edgez_b0s, &edgez_data); @@ -1391,7 +1385,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommData(device, sidexz_b0s, &sidexz_data); acTransferCommData(device, sideyz_b0s, &sideyz_data); - // acTransferCommDataWait(corner_data); + // acTransferCommDataWait(corner_data); // Do not rm: required for corners acTransferCommDataWait(edgex_data); acTransferCommDataWait(edgey_data); acTransferCommDataWait(edgez_data); @@ -1400,7 +1394,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommDataWait(sideyz_data); #if MPI_GPUDIRECT_DISABLED - // acTransferCommDataToDevice(device, &corner_data); + // acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners acTransferCommDataToDevice(device, &edgex_data); acTransferCommDataToDevice(device, &edgey_data); acTransferCommDataToDevice(device, &edgez_data); @@ -1409,7 +1403,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommDataToDevice(device, &sideyz_data); #endif - // acUnpinCommData(device, &corner_data); + // acUnpinCommData(device, &corner_data); // Do not rm: required for corners acUnpinCommData(device, &edgex_data); acUnpinCommData(device, &edgey_data); acUnpinCommData(device, &edgez_data); @@ -1427,7 +1421,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) //////////// OUTER INTEGRATION ////////////// // Wait for unpacking - // acSyncCommData(corner_data); + // acSyncCommData(corner_data); // Do not rm: required for corners acSyncCommData(edgex_data); acSyncCommData(edgey_data); acSyncCommData(edgez_data);