diff --git a/src/core/device.cc b/src/core/device.cc index a6ec793..0ee987f 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -14,7 +14,7 @@ #define MPI_DECOMPOSITION_AXES (3) #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) -#define MPI_INCL_CORNERS (1) +#define MPI_INCL_CORNERS (0) #define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost @@ -721,7 +721,7 @@ acCreatePackedDataHost(const int3 dims) data.dims = dims; const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; - data.data = (AcReal*)malloc(bytes); + data.data = (AcRealPacked*)malloc(bytes); ERRCHK_ALWAYS(data.data); return data; @@ -1132,8 +1132,13 @@ acTransferCommData(const Device device, // cudaSetDevice(device->id); MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) + if (sizeof(data->srcs[0].data[0]) == 2) { + datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF + } else if (sizeof(data->srcs[0].data[0]) == 4) { + datatype = MPI_FLOAT; + } else { datatype = MPI_DOUBLE; + } int nprocs, pid; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index fc9c745..513c5e4 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -8,11 +8,13 @@ #define MPI_GPUDIRECT_DISABLED (0) #endif // AC_MPI_ENABLED +typedef float AcRealPacked; + typedef struct { int3 dims; - AcReal* data; + AcRealPacked* data; - AcReal* data_pinned; + AcRealPacked* data_pinned; bool pinned = false; // Set if data was received to pinned memory } PackedData;