More accurate timing when benchmarking MPI. Also made GPU-GPU communication the default. Current version of UCX is bugged, must export 'UCX_MEMTYPE_CACHE=n' to workaround memory errors when doing GPU-GPU comm
This commit is contained in:
@@ -10,6 +10,7 @@
|
|||||||
#include "kernels/kernels.h"
|
#include "kernels/kernels.h"
|
||||||
|
|
||||||
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
||||||
|
#define MPI_GPUDIRECT_DISABLED (0)
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acDevicePrintInfo(const Device device)
|
acDevicePrintInfo(const Device device)
|
||||||
@@ -533,6 +534,7 @@ acDestroyPackedData(PackedData* data)
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
static PackedData
|
static PackedData
|
||||||
acCreatePackedDataHost(const int3 dims)
|
acCreatePackedDataHost(const int3 dims)
|
||||||
{
|
{
|
||||||
@@ -578,6 +580,7 @@ acDestroyPackedDataHost(PackedData* data)
|
|||||||
|
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
#endif // MPI_GPUDIRECT_DISABLED
|
||||||
|
|
||||||
// TODO: do with packed data
|
// TODO: do with packed data
|
||||||
static AcResult
|
static AcResult
|
||||||
@@ -739,111 +742,13 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
// Deprecated
|
|
||||||
static AcResult
|
|
||||||
acDeviceCommunicateBlocksMPI(const Device device, //
|
|
||||||
const int3* a0s, // Src idx inside comp. domain
|
|
||||||
const int3* b0s, // Dst idx inside bound zone
|
|
||||||
const size_t mapping_count, // Num a0s and b0s
|
|
||||||
const int3 dims) // Block size
|
|
||||||
{
|
|
||||||
cudaSetDevice(device->id);
|
|
||||||
acDeviceSynchronizeStream(device, STREAM_ALL);
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
MPI_Datatype datatype = MPI_FLOAT;
|
|
||||||
if (sizeof(AcReal) == 8)
|
|
||||||
datatype = MPI_DOUBLE;
|
|
||||||
|
|
||||||
int nprocs, pid;
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
|
||||||
const int3 decomp = decompose(nprocs);
|
|
||||||
|
|
||||||
const int3 nn = (int3){
|
|
||||||
device->local_config.int_params[AC_nx],
|
|
||||||
device->local_config.int_params[AC_ny],
|
|
||||||
device->local_config.int_params[AC_nz],
|
|
||||||
};
|
|
||||||
|
|
||||||
for (int k = -1; k <= 1; ++k) {
|
|
||||||
for (int j = -1; j <= 1; ++j) {
|
|
||||||
for (int i = -1; i <= 1; ++i) {
|
|
||||||
if (i == 0 && j == 0 && k == 0)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (size_t a_idx = 0; a_idx < mapping_count; ++a_idx) {
|
|
||||||
for (size_t b_idx = 0; b_idx < mapping_count; ++b_idx) {
|
|
||||||
const int3 neighbor = (int3){i, j, k};
|
|
||||||
|
|
||||||
const int3 a0 = a0s[a_idx];
|
|
||||||
// const int3 a1 = a0 + dims;
|
|
||||||
|
|
||||||
const int3 b0 = a0 - neighbor * nn;
|
|
||||||
// const int3 b1 = a1 - neighbor * nn;
|
|
||||||
|
|
||||||
if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
|
|
||||||
|
|
||||||
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
|
||||||
|
|
||||||
PackedData src = acCreatePackedData(dims);
|
|
||||||
PackedData dst = acCreatePackedData(dims);
|
|
||||||
|
|
||||||
const cudaStream_t stream = device->streams[STREAM_DEFAULT];
|
|
||||||
acKernelPackData(stream, device->vba, a0, src);
|
|
||||||
acDeviceSynchronizeStream(device, STREAM_DEFAULT);
|
|
||||||
|
|
||||||
// Host ////////////////////////////////////////////////
|
|
||||||
PackedData src_host = acCreatePackedDataHost(dims);
|
|
||||||
PackedData dst_host = acCreatePackedDataHost(dims);
|
|
||||||
acTransferPackedDataToHost(device, device->streams[STREAM_DEFAULT], src,
|
|
||||||
&src_host);
|
|
||||||
acDeviceSynchronizeStream(device, STREAM_ALL);
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
const int3 pid3d = getPid3D(pid, decomp);
|
|
||||||
MPI_Request send_req, recv_req;
|
|
||||||
MPI_Isend(src_host.data, count, datatype,
|
|
||||||
getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD,
|
|
||||||
&send_req);
|
|
||||||
MPI_Irecv(dst_host.data, count, datatype,
|
|
||||||
getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD,
|
|
||||||
&recv_req);
|
|
||||||
|
|
||||||
MPI_Wait(&send_req, MPI_STATUS_IGNORE);
|
|
||||||
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
|
|
||||||
|
|
||||||
// Host ////////////////////////////////////////////////
|
|
||||||
acTransferPackedDataToDevice(device, device->streams[STREAM_DEFAULT],
|
|
||||||
dst_host, &dst);
|
|
||||||
acDeviceSynchronizeStream(device, STREAM_ALL);
|
|
||||||
acDestroyPackedDataHost(&src_host);
|
|
||||||
acDestroyPackedDataHost(&dst_host);
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
acKernelUnpackData(stream, dst, b0, device->vba);
|
|
||||||
acDeviceSynchronizeStream(device, STREAM_DEFAULT);
|
|
||||||
|
|
||||||
acDestroyPackedData(&src);
|
|
||||||
acDestroyPackedData(&dst);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return AC_SUCCESS;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PackedData* srcs;
|
PackedData* srcs;
|
||||||
PackedData* dsts;
|
PackedData* dsts;
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
PackedData* srcs_host;
|
PackedData* srcs_host;
|
||||||
PackedData* dsts_host;
|
PackedData* dsts_host;
|
||||||
|
#endif
|
||||||
int3 dims;
|
int3 dims;
|
||||||
size_t count;
|
size_t count;
|
||||||
|
|
||||||
@@ -859,12 +764,10 @@ acCreateCommData(const Device device, const int3 dims, const size_t count)
|
|||||||
|
|
||||||
CommData data = {};
|
CommData data = {};
|
||||||
|
|
||||||
data.srcs = (PackedData*)malloc(count * sizeof(PackedData));
|
data.srcs = (PackedData*)malloc(count * sizeof(PackedData));
|
||||||
data.dsts = (PackedData*)malloc(count * sizeof(PackedData));
|
data.dsts = (PackedData*)malloc(count * sizeof(PackedData));
|
||||||
data.srcs_host = (PackedData*)malloc(count * sizeof(PackedData));
|
data.dims = dims;
|
||||||
data.dsts_host = (PackedData*)malloc(count * sizeof(PackedData));
|
data.count = count;
|
||||||
data.dims = dims;
|
|
||||||
data.count = count;
|
|
||||||
|
|
||||||
data.streams = (cudaStream_t*)malloc(count * sizeof(cudaStream_t));
|
data.streams = (cudaStream_t*)malloc(count * sizeof(cudaStream_t));
|
||||||
data.send_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request));
|
data.send_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request));
|
||||||
@@ -872,16 +775,24 @@ acCreateCommData(const Device device, const int3 dims, const size_t count)
|
|||||||
|
|
||||||
ERRCHK_ALWAYS(data.srcs);
|
ERRCHK_ALWAYS(data.srcs);
|
||||||
ERRCHK_ALWAYS(data.dsts);
|
ERRCHK_ALWAYS(data.dsts);
|
||||||
ERRCHK_ALWAYS(data.srcs_host);
|
|
||||||
ERRCHK_ALWAYS(data.dsts_host);
|
|
||||||
ERRCHK_ALWAYS(data.send_reqs);
|
ERRCHK_ALWAYS(data.send_reqs);
|
||||||
ERRCHK_ALWAYS(data.recv_reqs);
|
ERRCHK_ALWAYS(data.recv_reqs);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
|
data.srcs_host = (PackedData*)malloc(count * sizeof(PackedData));
|
||||||
|
data.dsts_host = (PackedData*)malloc(count * sizeof(PackedData));
|
||||||
|
ERRCHK_ALWAYS(data.srcs_host);
|
||||||
|
ERRCHK_ALWAYS(data.dsts_host);
|
||||||
|
#endif
|
||||||
|
|
||||||
for (size_t i = 0; i < count; ++i) {
|
for (size_t i = 0; i < count; ++i) {
|
||||||
data.srcs[i] = acCreatePackedData(dims);
|
data.srcs[i] = acCreatePackedData(dims);
|
||||||
data.dsts[i] = acCreatePackedData(dims);
|
data.dsts[i] = acCreatePackedData(dims);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
data.srcs_host[i] = acCreatePackedDataHost(dims);
|
data.srcs_host[i] = acCreatePackedDataHost(dims);
|
||||||
data.dsts_host[i] = acCreatePackedDataHost(dims);
|
data.dsts_host[i] = acCreatePackedDataHost(dims);
|
||||||
|
#endif
|
||||||
|
|
||||||
cudaStreamCreate(&data.streams[i]);
|
cudaStreamCreate(&data.streams[i]);
|
||||||
}
|
}
|
||||||
@@ -897,16 +808,22 @@ acDestroyCommData(const Device device, CommData* data)
|
|||||||
for (size_t i = 0; i < data->count; ++i) {
|
for (size_t i = 0; i < data->count; ++i) {
|
||||||
acDestroyPackedData(&data->srcs[i]);
|
acDestroyPackedData(&data->srcs[i]);
|
||||||
acDestroyPackedData(&data->dsts[i]);
|
acDestroyPackedData(&data->dsts[i]);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
acDestroyPackedDataHost(&data->srcs_host[i]);
|
acDestroyPackedDataHost(&data->srcs_host[i]);
|
||||||
acDestroyPackedDataHost(&data->dsts_host[i]);
|
acDestroyPackedDataHost(&data->dsts_host[i]);
|
||||||
|
#endif
|
||||||
|
|
||||||
cudaStreamDestroy(data->streams[i]);
|
cudaStreamDestroy(data->streams[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
free(data->srcs);
|
free(data->srcs);
|
||||||
free(data->dsts);
|
free(data->dsts);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
free(data->srcs_host);
|
free(data->srcs_host);
|
||||||
free(data->dsts_host);
|
free(data->dsts_host);
|
||||||
|
#endif
|
||||||
|
|
||||||
free(data->streams);
|
free(data->streams);
|
||||||
free(data->send_reqs);
|
free(data->send_reqs);
|
||||||
@@ -924,14 +841,6 @@ acPackCommData(const Device device, const int3* a0s, CommData* data)
|
|||||||
acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);
|
acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
acTransferCommDataToHost(const Device device, CommData* data)
|
|
||||||
{
|
|
||||||
cudaSetDevice(device->id);
|
|
||||||
for (size_t i = 0; i < data->count; ++i)
|
|
||||||
acTransferPackedDataToHost(device, data->streams[i], data->srcs[i], &data->srcs_host[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
acUnpackCommData(const Device device, const int3* b0s, CommData* data)
|
acUnpackCommData(const Device device, const int3* b0s, CommData* data)
|
||||||
{
|
{
|
||||||
@@ -941,6 +850,15 @@ acUnpackCommData(const Device device, const int3* b0s, CommData* data)
|
|||||||
acKernelUnpackData(data->streams[i], data->dsts[i], b0s[i], device->vba);
|
acKernelUnpackData(data->streams[i], data->dsts[i], b0s[i], device->vba);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
|
static void
|
||||||
|
acTransferCommDataToHost(const Device device, CommData* data)
|
||||||
|
{
|
||||||
|
cudaSetDevice(device->id);
|
||||||
|
for (size_t i = 0; i < data->count; ++i)
|
||||||
|
acTransferPackedDataToHost(device, data->streams[i], data->srcs[i], &data->srcs_host[i]);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
acTransferCommDataToDevice(const Device device, CommData* data)
|
acTransferCommDataToDevice(const Device device, CommData* data)
|
||||||
{
|
{
|
||||||
@@ -948,6 +866,7 @@ acTransferCommDataToDevice(const Device device, CommData* data)
|
|||||||
for (size_t i = 0; i < data->count; ++i)
|
for (size_t i = 0; i < data->count; ++i)
|
||||||
acTransferPackedDataToDevice(device, data->streams[i], data->dsts_host[i], &data->dsts[i]);
|
acTransferPackedDataToDevice(device, data->streams[i], data->dsts_host[i], &data->dsts[i]);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static AcResult
|
static AcResult
|
||||||
acTransferCommData(const Device device, //
|
acTransferCommData(const Device device, //
|
||||||
@@ -995,10 +914,11 @@ acTransferCommData(const Device device, //
|
|||||||
|
|
||||||
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
||||||
|
|
||||||
// PackedData src = data->srcs[a_idx];
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
// PackedData dst = data->dsts[b_idx];
|
|
||||||
// PackedData src = data->srcs_host[a_idx];
|
|
||||||
PackedData dst = data->dsts_host[b_idx];
|
PackedData dst = data->dsts_host[b_idx];
|
||||||
|
#else
|
||||||
|
PackedData dst = data->dsts[b_idx];
|
||||||
|
#endif
|
||||||
|
|
||||||
const int3 pid3d = getPid3D(pid, decomp);
|
const int3 pid3d = getPid3D(pid, decomp);
|
||||||
MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
|
MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
|
||||||
@@ -1030,10 +950,11 @@ acTransferCommData(const Device device, //
|
|||||||
|
|
||||||
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
|
||||||
|
|
||||||
// PackedData src = data->srcs[a_idx];
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
// PackedData dst = data->dsts[b_idx];
|
|
||||||
PackedData src = data->srcs_host[a_idx];
|
PackedData src = data->srcs_host[a_idx];
|
||||||
// PackedData dst = data->dsts_host[b_idx];
|
#else
|
||||||
|
PackedData src = data->srcs[a_idx];
|
||||||
|
#endif
|
||||||
|
|
||||||
const int3 pid3d = getPid3D(pid, decomp);
|
const int3 pid3d = getPid3D(pid, decomp);
|
||||||
|
|
||||||
@@ -1198,6 +1119,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acPackCommData(device, sidexz_a0s, &sidexz_data);
|
acPackCommData(device, sidexz_a0s, &sidexz_data);
|
||||||
acPackCommData(device, sideyz_a0s, &sideyz_data);
|
acPackCommData(device, sideyz_a0s, &sideyz_data);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
acTransferCommDataToHost(device, &corner_data);
|
acTransferCommDataToHost(device, &corner_data);
|
||||||
acTransferCommDataToHost(device, &edgex_data);
|
acTransferCommDataToHost(device, &edgex_data);
|
||||||
acTransferCommDataToHost(device, &edgey_data);
|
acTransferCommDataToHost(device, &edgey_data);
|
||||||
@@ -1205,6 +1127,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acTransferCommDataToHost(device, &sidexy_data);
|
acTransferCommDataToHost(device, &sidexy_data);
|
||||||
acTransferCommDataToHost(device, &sidexz_data);
|
acTransferCommDataToHost(device, &sidexz_data);
|
||||||
acTransferCommDataToHost(device, &sideyz_data);
|
acTransferCommDataToHost(device, &sideyz_data);
|
||||||
|
#endif
|
||||||
|
|
||||||
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
|
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
|
||||||
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
|
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
|
||||||
@@ -1222,6 +1145,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acTransferCommDataWait(sidexz_data);
|
acTransferCommDataWait(sidexz_data);
|
||||||
acTransferCommDataWait(sideyz_data);
|
acTransferCommDataWait(sideyz_data);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
acTransferCommDataToDevice(device, &corner_data);
|
acTransferCommDataToDevice(device, &corner_data);
|
||||||
acTransferCommDataToDevice(device, &edgex_data);
|
acTransferCommDataToDevice(device, &edgex_data);
|
||||||
acTransferCommDataToDevice(device, &edgey_data);
|
acTransferCommDataToDevice(device, &edgey_data);
|
||||||
@@ -1229,6 +1153,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acTransferCommDataToDevice(device, &sidexy_data);
|
acTransferCommDataToDevice(device, &sidexy_data);
|
||||||
acTransferCommDataToDevice(device, &sidexz_data);
|
acTransferCommDataToDevice(device, &sidexz_data);
|
||||||
acTransferCommDataToDevice(device, &sideyz_data);
|
acTransferCommDataToDevice(device, &sideyz_data);
|
||||||
|
#endif
|
||||||
|
|
||||||
acUnpackCommData(device, corner_b0s, &corner_data);
|
acUnpackCommData(device, corner_b0s, &corner_data);
|
||||||
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
||||||
@@ -1244,6 +1169,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
timer_reset(&ttot);
|
timer_reset(&ttot);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
acPackCommData(device, corner_a0s, &corner_data);
|
acPackCommData(device, corner_a0s, &corner_data);
|
||||||
acPackCommData(device, edgex_a0s, &edgex_data);
|
acPackCommData(device, edgex_a0s, &edgex_data);
|
||||||
@@ -1253,6 +1179,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acPackCommData(device, sidexz_a0s, &sidexz_data);
|
acPackCommData(device, sidexz_a0s, &sidexz_data);
|
||||||
acPackCommData(device, sideyz_a0s, &sideyz_data);
|
acPackCommData(device, sideyz_a0s, &sideyz_data);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
acTransferCommDataToHost(device, &corner_data);
|
acTransferCommDataToHost(device, &corner_data);
|
||||||
acTransferCommDataToHost(device, &edgex_data);
|
acTransferCommDataToHost(device, &edgex_data);
|
||||||
acTransferCommDataToHost(device, &edgey_data);
|
acTransferCommDataToHost(device, &edgey_data);
|
||||||
@@ -1260,6 +1187,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acTransferCommDataToHost(device, &sidexy_data);
|
acTransferCommDataToHost(device, &sidexy_data);
|
||||||
acTransferCommDataToHost(device, &sidexz_data);
|
acTransferCommDataToHost(device, &sidexz_data);
|
||||||
acTransferCommDataToHost(device, &sideyz_data);
|
acTransferCommDataToHost(device, &sideyz_data);
|
||||||
|
#endif
|
||||||
|
|
||||||
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
|
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
|
||||||
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
|
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
|
||||||
@@ -1277,6 +1205,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acTransferCommDataWait(sidexz_data);
|
acTransferCommDataWait(sidexz_data);
|
||||||
acTransferCommDataWait(sideyz_data);
|
acTransferCommDataWait(sideyz_data);
|
||||||
|
|
||||||
|
#if MPI_GPUDIRECT_DISABLED
|
||||||
acTransferCommDataToDevice(device, &corner_data);
|
acTransferCommDataToDevice(device, &corner_data);
|
||||||
acTransferCommDataToDevice(device, &edgex_data);
|
acTransferCommDataToDevice(device, &edgex_data);
|
||||||
acTransferCommDataToDevice(device, &edgey_data);
|
acTransferCommDataToDevice(device, &edgey_data);
|
||||||
@@ -1284,6 +1213,7 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acTransferCommDataToDevice(device, &sidexy_data);
|
acTransferCommDataToDevice(device, &sidexy_data);
|
||||||
acTransferCommDataToDevice(device, &sidexz_data);
|
acTransferCommDataToDevice(device, &sidexz_data);
|
||||||
acTransferCommDataToDevice(device, &sideyz_data);
|
acTransferCommDataToDevice(device, &sideyz_data);
|
||||||
|
#endif
|
||||||
|
|
||||||
acUnpackCommData(device, corner_b0s, &corner_data);
|
acUnpackCommData(device, corner_b0s, &corner_data);
|
||||||
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
||||||
@@ -1293,63 +1223,21 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
|
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
|
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
|
|
||||||
/*
|
|
||||||
acPackCommData(device, corner_a0s, &corner_data);
|
|
||||||
acPackCommData(device, edgex_a0s, &edgex_data);
|
|
||||||
acPackCommData(device, edgey_a0s, &edgey_data);
|
|
||||||
acPackCommData(device, edgez_a0s, &edgez_data);
|
|
||||||
acPackCommData(device, sidexy_a0s, &sidexy_data);
|
|
||||||
acPackCommData(device, sidexz_a0s, &sidexz_data);
|
|
||||||
acPackCommData(device, sideyz_a0s, &sideyz_data);
|
|
||||||
|
|
||||||
acTransferCommDataToHost(device, &corner_data);
|
|
||||||
acTransferCommDataToHost(device, &edgex_data);
|
|
||||||
acTransferCommDataToHost(device, &edgey_data);
|
|
||||||
acTransferCommDataToHost(device, &edgez_data);
|
|
||||||
acTransferCommDataToHost(device, &sidexy_data);
|
|
||||||
acTransferCommDataToHost(device, &sidexz_data);
|
|
||||||
acTransferCommDataToHost(device, &sideyz_data);
|
|
||||||
|
|
||||||
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
|
|
||||||
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
|
|
||||||
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
|
|
||||||
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
|
|
||||||
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
|
|
||||||
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
|
|
||||||
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
|
|
||||||
|
|
||||||
acTransferCommDataWait(corner_data);
|
|
||||||
acTransferCommDataWait(edgex_data);
|
|
||||||
acTransferCommDataWait(edgey_data);
|
|
||||||
acTransferCommDataWait(edgez_data);
|
|
||||||
acTransferCommDataWait(sidexy_data);
|
|
||||||
acTransferCommDataWait(sidexz_data);
|
|
||||||
acTransferCommDataWait(sideyz_data);
|
|
||||||
|
|
||||||
acTransferCommDataToDevice(device, &corner_data);
|
|
||||||
acTransferCommDataToDevice(device, &edgex_data);
|
|
||||||
acTransferCommDataToDevice(device, &edgey_data);
|
|
||||||
acTransferCommDataToDevice(device, &edgez_data);
|
|
||||||
acTransferCommDataToDevice(device, &sidexy_data);
|
|
||||||
acTransferCommDataToDevice(device, &sidexz_data);
|
|
||||||
acTransferCommDataToDevice(device, &sideyz_data);
|
|
||||||
|
|
||||||
acUnpackCommData(device, corner_b0s, &corner_data);
|
|
||||||
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
|
||||||
acUnpackCommData(device, edgey_b0s, &edgey_data);
|
|
||||||
acUnpackCommData(device, edgez_b0s, &edgez_data);
|
|
||||||
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
|
|
||||||
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
|
|
||||||
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
|
|
||||||
*/
|
|
||||||
|
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
int pid;
|
const double msec = timer_diff_nsec(ttot) / 1e6;
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
int pid, nprocs;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
if (!pid) {
|
if (!pid) {
|
||||||
printf("---------------------------Total: ");
|
printf("--- Total communication time per step: %f ms\n", msec);
|
||||||
timer_diff_print(ttot);
|
|
||||||
|
// Write out to file
|
||||||
|
FILE* fp = fopen("benchmark.result", "a+");
|
||||||
|
fprintf(fp, "%d, %f\n", nprocs, msec);
|
||||||
|
fclose(fp);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dealloc
|
// Dealloc
|
||||||
|
Reference in New Issue
Block a user