Fixed various compilation warnings
This commit is contained in:
@@ -13,6 +13,8 @@
|
|||||||
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
|
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
|
||||||
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
|
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
|
||||||
|
|
||||||
|
#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Findings:
|
Findings:
|
||||||
- MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
|
- MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
|
||||||
@@ -27,7 +29,7 @@ static uint8_t*
|
|||||||
allocHost(const size_t bytes)
|
allocHost(const size_t bytes)
|
||||||
{
|
{
|
||||||
uint8_t* arr = malloc(bytes);
|
uint8_t* arr = malloc(bytes);
|
||||||
assert(arr);
|
errchk(arr);
|
||||||
return arr;
|
return arr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,7 +49,7 @@ allocDevice(const size_t bytes)
|
|||||||
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
|
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
|
||||||
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
|
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
|
||||||
// const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
// const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
||||||
assert(retval == cudaSuccess);
|
errchk(retval == cudaSuccess);
|
||||||
return arr;
|
return arr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -61,7 +63,7 @@ allocDevicePinned(const size_t bytes)
|
|||||||
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
|
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
|
||||||
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
|
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
|
||||||
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
||||||
assert(retval == cudaSuccess);
|
errchk(retval == cudaSuccess);
|
||||||
return arr;
|
return arr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -147,6 +149,7 @@ sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
static void
|
static void
|
||||||
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
|
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
|
||||||
{
|
{
|
||||||
@@ -154,7 +157,7 @@ sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
|
|||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
|
|
||||||
MPI_Request recv_requests[nprocs], send_requests[nprocs];
|
MPI_Request send_requests[nprocs];
|
||||||
for (int i = 1; i < nprocs; ++i) {
|
for (int i = 1; i < nprocs; ++i) {
|
||||||
int nfront = (pid + i) % nprocs;
|
int nfront = (pid + i) % nprocs;
|
||||||
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
|
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
|
||||||
@@ -180,6 +183,7 @@ sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
|
|||||||
MPI_Wait(&send_requests[i], &status);
|
MPI_Wait(&send_requests[i], &status);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
static void
|
static void
|
||||||
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
|
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
|
||||||
@@ -198,8 +202,6 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
|
|||||||
int devices_per_node = -1;
|
int devices_per_node = -1;
|
||||||
cudaGetDeviceCount(&devices_per_node);
|
cudaGetDeviceCount(&devices_per_node);
|
||||||
|
|
||||||
const int node_id = pid / devices_per_node;
|
|
||||||
|
|
||||||
MPI_Request recv_requests[nprocs], send_requests[nprocs];
|
MPI_Request recv_requests[nprocs], send_requests[nprocs];
|
||||||
for (int i = 1; i < nprocs; ++i) {
|
for (int i = 1; i < nprocs; ++i) {
|
||||||
int nfront = (pid + i) % nprocs;
|
int nfront = (pid + i) % nprocs;
|
||||||
@@ -226,20 +228,20 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
send_d2h(const uint8_t* src, uint8_t* dst)
|
send_d2h(uint8_t* src, uint8_t* dst)
|
||||||
{
|
{
|
||||||
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
|
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
send_h2d(const uint8_t* src, uint8_t* dst)
|
send_h2d(uint8_t* src, uint8_t* dst)
|
||||||
{
|
{
|
||||||
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
|
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst)
|
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
|
||||||
{
|
{
|
||||||
cudaStream_t d2h, h2d;
|
cudaStream_t d2h, h2d;
|
||||||
cudaStreamCreate(&d2h);
|
cudaStreamCreate(&d2h);
|
||||||
@@ -299,8 +301,8 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
|
|||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst,
|
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
|
||||||
const uint8_t* hsrc, uint8_t* ddst)
|
uint8_t* hsrc, uint8_t* ddst)
|
||||||
{
|
{
|
||||||
const size_t num_samples = 100;
|
const size_t num_samples = 100;
|
||||||
|
|
||||||
@@ -342,7 +344,7 @@ main(void)
|
|||||||
MPI_Init(NULL, NULL);
|
MPI_Init(NULL, NULL);
|
||||||
// int provided;
|
// int provided;
|
||||||
// MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
|
// MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
|
||||||
// assert(provided >= MPI_THREAD_MULTIPLE);
|
// errchk(provided >= MPI_THREAD_MULTIPLE);
|
||||||
|
|
||||||
// Disable stdout buffering
|
// Disable stdout buffering
|
||||||
setbuf(stdout, NULL);
|
setbuf(stdout, NULL);
|
||||||
@@ -350,7 +352,7 @@ main(void)
|
|||||||
int pid, nprocs;
|
int pid, nprocs;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
assert(nprocs >= 2); // Require at least one neighbor
|
errchk(nprocs >= 2); // Require at least one neighbor
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
if (!pid) {
|
if (!pid) {
|
||||||
@@ -432,7 +434,7 @@ main(void)
|
|||||||
freeDevice(dst);
|
freeDevice(dst);
|
||||||
}
|
}
|
||||||
PRINT("\n------------------------\n");
|
PRINT("\n------------------------\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
uint8_t* hsrc = allocHost(BLOCK_SIZE);
|
uint8_t* hsrc = allocHost(BLOCK_SIZE);
|
||||||
uint8_t* hdst = allocHost(BLOCK_SIZE);
|
uint8_t* hdst = allocHost(BLOCK_SIZE);
|
||||||
@@ -450,7 +452,7 @@ main(void)
|
|||||||
freeHost(hdst);
|
freeHost(hdst);
|
||||||
}
|
}
|
||||||
PRINT("\n------------------------\n");
|
PRINT("\n------------------------\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
uint8_t* hsrc = allocHost(BLOCK_SIZE);
|
uint8_t* hsrc = allocHost(BLOCK_SIZE);
|
||||||
uint8_t* hdst = allocHost(BLOCK_SIZE);
|
uint8_t* hdst = allocHost(BLOCK_SIZE);
|
||||||
|
@@ -962,7 +962,7 @@ acSyncCommData(const CommData data)
|
|||||||
static int3
|
static int3
|
||||||
mod(const int3 a, const int3 n)
|
mod(const int3 a, const int3 n)
|
||||||
{
|
{
|
||||||
return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
|
return (int3){(int)mod(a.x, n.x), (int)mod(a.y, n.y), (int)mod(a.z, n.z)};
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@@ -1058,7 +1058,6 @@ acTransferCommData(const Device device, //
|
|||||||
const int3 dims = data->dims;
|
const int3 dims = data->dims;
|
||||||
const size_t blockcount = data->count;
|
const size_t blockcount = data->count;
|
||||||
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
||||||
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
|
|
||||||
|
|
||||||
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
|
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
|
||||||
|
|
||||||
@@ -1286,7 +1285,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
|
|
||||||
const Device device = grid.device;
|
const Device device = grid.device;
|
||||||
const int3 nn = grid.nn;
|
const int3 nn = grid.nn;
|
||||||
CommData corner_data = grid.corner_data;
|
//CommData corner_data = grid.corner_data; // Do not rm: required for corners
|
||||||
CommData edgex_data = grid.edgex_data;
|
CommData edgex_data = grid.edgex_data;
|
||||||
CommData edgey_data = grid.edgey_data;
|
CommData edgey_data = grid.edgey_data;
|
||||||
CommData edgez_data = grid.edgez_data;
|
CommData edgez_data = grid.edgez_data;
|
||||||
@@ -1297,6 +1296,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acDeviceSynchronizeStream(device, stream);
|
acDeviceSynchronizeStream(device, stream);
|
||||||
|
|
||||||
// Corners
|
// Corners
|
||||||
|
/*
|
||||||
|
// Do not rm: required for corners
|
||||||
const int3 corner_b0s[] = {
|
const int3 corner_b0s[] = {
|
||||||
(int3){0, 0, 0},
|
(int3){0, 0, 0},
|
||||||
(int3){NGHOST + nn.x, 0, 0},
|
(int3){NGHOST + nn.x, 0, 0},
|
||||||
@@ -1308,6 +1309,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
|
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
|
||||||
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
|
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
|
||||||
};
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
// Edges X
|
// Edges X
|
||||||
const int3 edgex_b0s[] = {
|
const int3 edgex_b0s[] = {
|
||||||
@@ -1355,7 +1357,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
};
|
};
|
||||||
|
|
||||||
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
||||||
// acPackCommData(device, corner_b0s, &corner_data);
|
// acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
||||||
acPackCommData(device, edgex_b0s, &edgex_data);
|
acPackCommData(device, edgex_b0s, &edgex_data);
|
||||||
acPackCommData(device, edgey_b0s, &edgey_data);
|
acPackCommData(device, edgey_b0s, &edgey_data);
|
||||||
acPackCommData(device, edgez_b0s, &edgez_data);
|
acPackCommData(device, edgez_b0s, &edgez_data);
|
||||||
@@ -1363,18 +1365,10 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acPackCommData(device, sidexz_b0s, &sidexz_data);
|
acPackCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
acPackCommData(device, sideyz_b0s, &sideyz_data);
|
acPackCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
|
|
||||||
//////////// INNER INTEGRATION //////////////
|
|
||||||
{
|
|
||||||
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
|
||||||
const int3 m2 = nn;
|
|
||||||
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
|
|
||||||
}
|
|
||||||
////////////////////////////////////////////
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
#if MPI_GPUDIRECT_DISABLED
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
// acTransferCommDataToHost(device, &corner_data);
|
// acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
|
||||||
acTransferCommDataToHost(device, &edgex_data);
|
acTransferCommDataToHost(device, &edgex_data);
|
||||||
acTransferCommDataToHost(device, &edgey_data);
|
acTransferCommDataToHost(device, &edgey_data);
|
||||||
acTransferCommDataToHost(device, &edgez_data);
|
acTransferCommDataToHost(device, &edgez_data);
|
||||||
@@ -1383,7 +1377,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acTransferCommDataToHost(device, &sideyz_data);
|
acTransferCommDataToHost(device, &sideyz_data);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// acTransferCommData(device, corner_b0s, &corner_data);
|
// acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
||||||
acTransferCommData(device, edgex_b0s, &edgex_data);
|
acTransferCommData(device, edgex_b0s, &edgex_data);
|
||||||
acTransferCommData(device, edgey_b0s, &edgey_data);
|
acTransferCommData(device, edgey_b0s, &edgey_data);
|
||||||
acTransferCommData(device, edgez_b0s, &edgez_data);
|
acTransferCommData(device, edgez_b0s, &edgez_data);
|
||||||
@@ -1391,7 +1385,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acTransferCommData(device, sidexz_b0s, &sidexz_data);
|
acTransferCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
acTransferCommData(device, sideyz_b0s, &sideyz_data);
|
acTransferCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
|
|
||||||
// acTransferCommDataWait(corner_data);
|
// acTransferCommDataWait(corner_data); // Do not rm: required for corners
|
||||||
acTransferCommDataWait(edgex_data);
|
acTransferCommDataWait(edgex_data);
|
||||||
acTransferCommDataWait(edgey_data);
|
acTransferCommDataWait(edgey_data);
|
||||||
acTransferCommDataWait(edgez_data);
|
acTransferCommDataWait(edgez_data);
|
||||||
@@ -1400,7 +1394,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acTransferCommDataWait(sideyz_data);
|
acTransferCommDataWait(sideyz_data);
|
||||||
|
|
||||||
#if MPI_GPUDIRECT_DISABLED
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
// acTransferCommDataToDevice(device, &corner_data);
|
// acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
|
||||||
acTransferCommDataToDevice(device, &edgex_data);
|
acTransferCommDataToDevice(device, &edgex_data);
|
||||||
acTransferCommDataToDevice(device, &edgey_data);
|
acTransferCommDataToDevice(device, &edgey_data);
|
||||||
acTransferCommDataToDevice(device, &edgez_data);
|
acTransferCommDataToDevice(device, &edgez_data);
|
||||||
@@ -1409,7 +1403,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acTransferCommDataToDevice(device, &sideyz_data);
|
acTransferCommDataToDevice(device, &sideyz_data);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// acUnpinCommData(device, &corner_data);
|
// acUnpinCommData(device, &corner_data); // Do not rm: required for corners
|
||||||
acUnpinCommData(device, &edgex_data);
|
acUnpinCommData(device, &edgex_data);
|
||||||
acUnpinCommData(device, &edgey_data);
|
acUnpinCommData(device, &edgey_data);
|
||||||
acUnpinCommData(device, &edgez_data);
|
acUnpinCommData(device, &edgez_data);
|
||||||
@@ -1427,7 +1421,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
//////////// OUTER INTEGRATION //////////////
|
//////////// OUTER INTEGRATION //////////////
|
||||||
|
|
||||||
// Wait for unpacking
|
// Wait for unpacking
|
||||||
// acSyncCommData(corner_data);
|
// acSyncCommData(corner_data); // Do not rm: required for corners
|
||||||
acSyncCommData(edgex_data);
|
acSyncCommData(edgex_data);
|
||||||
acSyncCommData(edgey_data);
|
acSyncCommData(edgey_data);
|
||||||
acSyncCommData(edgez_data);
|
acSyncCommData(edgez_data);
|
||||||
|
Reference in New Issue
Block a user