From 0a2827593c3479f1b117476663da70f6e6263972 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 28 Oct 2020 12:56:34 +0200 Subject: [PATCH] Added very experimental implementation for mixed precision. Comm is done with f32 and comp with f64. --- src/core/device.cc | 11 ++++++++--- src/core/kernels/kernels.h | 6 ++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index a6ec793..0ee987f 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -14,7 +14,7 @@ #define MPI_DECOMPOSITION_AXES (3) #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) -#define MPI_INCL_CORNERS (1) +#define MPI_INCL_CORNERS (0) #define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost @@ -721,7 +721,7 @@ acCreatePackedDataHost(const int3 dims) data.dims = dims; const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; - data.data = (AcReal*)malloc(bytes); + data.data = (AcRealPacked*)malloc(bytes); ERRCHK_ALWAYS(data.data); return data; @@ -1132,8 +1132,13 @@ acTransferCommData(const Device device, // cudaSetDevice(device->id); MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) + if (sizeof(data->srcs[0].data[0]) == 2) { + datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF + } else if (sizeof(data->srcs[0].data[0]) == 4) { + datatype = MPI_FLOAT; + } else { datatype = MPI_DOUBLE; + } int nprocs, pid; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index fc9c745..513c5e4 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -8,11 +8,13 @@ #define MPI_GPUDIRECT_DISABLED (0) #endif // AC_MPI_ENABLED +typedef float AcRealPacked; + typedef struct { int3 dims; - AcReal* data; + AcRealPacked* data; - AcReal* data_pinned; + AcRealPacked* data_pinned; bool pinned = false; // Set if data was received to pinned memory } PackedData;