Added a very experimental implementation of mixed precision: communication is done in f32 and computation in f64.

This commit is contained in:
jpekkila
2020-10-28 12:56:34 +02:00
parent c1f2a6c340
commit 0a2827593c
2 changed files with 12 additions and 5 deletions

View File

@@ -14,7 +14,7 @@
#define MPI_DECOMPOSITION_AXES (3)
#define MPI_COMPUTE_ENABLED (1)
#define MPI_COMM_ENABLED (1)
#define MPI_INCL_CORNERS (1)
#define MPI_INCL_CORNERS (0)
#define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory
#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
@@ -721,7 +721,7 @@ acCreatePackedDataHost(const int3 dims)
data.dims = dims;
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
data.data = (AcReal*)malloc(bytes);
data.data = (AcRealPacked*)malloc(bytes);
ERRCHK_ALWAYS(data.data);
return data;
@@ -1132,8 +1132,13 @@ acTransferCommData(const Device device, //
cudaSetDevice(device->id);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
if (sizeof(data->srcs[0].data[0]) == 2) {
datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF
} else if (sizeof(data->srcs[0].data[0]) == 4) {
datatype = MPI_FLOAT;
} else {
datatype = MPI_DOUBLE;
}
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

View File

@@ -8,11 +8,13 @@
#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED
typedef float AcRealPacked;
// PackedData: groups the `dims` extents with the `data` and `data_pinned`
// buffer pointers and a `pinned` flag.
// NOTE(review): this span is a rendered diff without +/- markers, so each
// pointer member is listed twice (an AcReal* line followed by an AcRealPacked*
// line). Presumably the AcReal* lines are the removed ones and only the
// AcRealPacked* members exist in the real header — confirm against the file.
typedef struct {
int3 dims;
AcReal* data;
AcRealPacked* data;
AcReal* data_pinned;
AcRealPacked* data_pinned;
bool pinned = false; // Set if data was received to pinned memory
} PackedData;