Merge branch 'master' of https://bitbucket.org/jpekkila/astaroth
@@ -180,21 +180,21 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     // VBA in/out
     const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->vba.in[i], vba_size_bytes));
-        ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->vba.out[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
     }
     // VBA Profiles
     const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
                                                            max(device_config.int_params[AC_my],
                                                                device_config.int_params[AC_mz]));
     for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->vba.profiles[i], profile_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
     }
 
     // Reductions
-    ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->reduce_scratchpad,
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
                                          acVertexBufferCompdomainSizeBytes(device_config)));
-    ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->reduce_result, sizeof(AcReal)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
 
 #if PACKED_DATA_TRANSFERS
     // Allocate data required for packed transfers here (cudaMalloc)
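Note on the hunk above: the allocations switch from unified (managed) memory to plain device memory, so the vertex buffers are no longer directly addressable from the host and every host-side access has to go through explicit copies. A minimal standalone sketch of that difference, using hypothetical names (`managed`, `dev_buf`, `n`) rather than Astaroth code:

```c
#include <cuda_runtime.h>

int main(void)
{
    const size_t n = 1 << 20;
    float* managed = NULL;
    float* dev_buf = NULL;

    // Managed memory: host and device both dereference the same pointer,
    // pages migrate on demand (what the old cudaMallocManaged calls relied on).
    cudaMallocManaged(&managed, n * sizeof(float));
    managed[0] = 1.0f; // legal host access

    // Plain device memory: host access requires an explicit copy
    // (the pattern the new cudaMalloc-based allocations imply).
    cudaMalloc(&dev_buf, n * sizeof(float));
    const float host_value = 2.0f;
    cudaMemcpy(dev_buf, &host_value, sizeof(float), cudaMemcpyHostToDevice);

    cudaFree(managed);
    cudaFree(dev_buf);
    return 0;
}
```

Device pointers obtained this way are also what the MPI halo exchange further down passes straight to `MPI_Isend`/`MPI_Irecv`, which is the usual setup for CUDA-aware MPI.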
@@ -862,7 +862,63 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)
 static AcResult
 acDeviceCommunicateHalosMPI(const Device device)
 {
-    MPI_Barrier(MPI_COMM_WORLD);
+    //MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    const size_t count = device->local_config.int_params[AC_mx] *
+                         device->local_config.int_params[AC_my] * NGHOST;
+    MPI_Request recv_requests[2*NUM_VTXBUF_HANDLES];
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        { // Front
+            // ...|ooooxxx|... -> xxx|ooooooo|...
+            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
+                                                     device->local_config);
+            const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
+            const int send_pid = (pid + 1) % num_processes;
+            const int recv_pid = (pid + num_processes - 1) % num_processes;
+
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
+                      &recv_requests[i]);
+
+            MPI_Request request;
+            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
+                      &request);
+
+        }
+        { // Back
+            // ...|ooooooo|xxx <- ...|xxxoooo|...
+            const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
+            const size_t dst_idx = acVertexBufferIdx(
+                0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
+            const int send_pid = (pid + num_processes - 1) % num_processes;
+            const int recv_pid = (pid + 1) % num_processes;
+
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i,
+                      MPI_COMM_WORLD, &recv_requests[NUM_VTXBUF_HANDLES + i]);
+
+            MPI_Request request;
+            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
+                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request);
+        }
+    }
+    MPI_Status recv_statuses[2*NUM_VTXBUF_HANDLES];
+    MPI_Waitall(2*NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
+
+    return AC_SUCCESS;
+}
+
+#if 0
+/** NOTE: Assumes 1 process per GPU */
+static AcResult
+acDeviceCommunicateHalosMPI(const Device device)
+{
+    //MPI_Barrier(MPI_COMM_WORLD);
     MPI_Datatype datatype = MPI_FLOAT;
     if (sizeof(AcReal) == 8)
         datatype = MPI_DOUBLE;
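The new `acDeviceCommunicateHalosMPI` above is a nonblocking ring exchange along z: for each field, the front halo uses tag `i` and the back halo tag `NUM_VTXBUF_HANDLES + i`, neighbors are `(pid + 1) % num_processes` and `(pid + num_processes - 1) % num_processes`, and only the receive requests are collected in `MPI_Waitall` (the send request handles simply go out of scope). The sketch below shows the same pattern for a single hypothetical buffer (`halo_count`, `send_buf`, `recv_buf` are illustrative, not from the diff) and additionally waits on the send request so no handle is leaked:

```c
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Illustrative size; in the diff, count = AC_mx * AC_my * NGHOST per field.
    const int halo_count = 1024;
    const int field_tag  = 0;
    double* send_buf = calloc(halo_count, sizeof(double));
    double* recv_buf = calloc(halo_count, sizeof(double));

    // Front exchange: send own top slab to pid+1, receive the halo from pid-1.
    const int send_pid = (pid + 1) % nprocs;
    const int recv_pid = (pid + nprocs - 1) % nprocs;

    MPI_Request reqs[2];
    MPI_Irecv(recv_buf, halo_count, MPI_DOUBLE, recv_pid, field_tag,
              MPI_COMM_WORLD, &reqs[0]);
    MPI_Isend(send_buf, halo_count, MPI_DOUBLE, send_pid, field_tag,
              MPI_COMM_WORLD, &reqs[1]);
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE); // completes both recv and send

    free(send_buf);
    free(recv_buf);
    MPI_Finalize();
    return 0;
}
```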
@@ -877,6 +933,7 @@ acDeviceCommunicateHalosMPI(const Device device)
     MPI_Status status_front[NUM_VTXBUF_HANDLES];
     MPI_Request request_back[NUM_VTXBUF_HANDLES];
     MPI_Status status_back[NUM_VTXBUF_HANDLES];
+    //#pragma omp parallel for
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         { // Front
             // ...|ooooxxx|... -> xxx|ooooooo|...
@@ -900,6 +957,7 @@ acDeviceCommunicateHalosMPI(const Device device)
                       NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request_back[i]);
         }
     }
+    //#pragma omp parallel for
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         { // Front
             // ...|ooooxxx|... -> xxx|ooooooo|...
@@ -921,6 +979,7 @@ acDeviceCommunicateHalosMPI(const Device device)
                       MPI_COMM_WORLD, &status_back[i]);
         }
     }
+    //#pragma omp parallel for
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         MPI_Wait(&request_front[i], &status_front[i]);
         MPI_Wait(&request_back[i], &status_back[i]);
@@ -968,6 +1027,7 @@ acDeviceCommunicateHalosMPI(const Device device)
 */
     return AC_SUCCESS;
 }
+#endif
 
 static void
 acHostCommunicateHalosMPI(AcMesh* submesh)
@@ -1036,6 +1096,9 @@ acDeviceRunMPITest(void)
 {
     int num_processes, pid;
     MPI_Init(NULL, NULL);
+    //int provided;
+    //MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); // Hybrid MP + MPI
+    //ERRCHK_ALWAYS(provided == MPI_THREAD_MULTIPLE);
     MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
 
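The commented-out lines above hint at a hybrid OpenMP+MPI setup, where the library must be initialized with `MPI_THREAD_MULTIPLE` and the granted level verified. A minimal sketch of that initialization, using only standard MPI calls:

```c
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (provided < MPI_THREAD_MULTIPLE) {
        // The library fell back to a weaker threading level; abort rather
        // than risk concurrent MPI calls from multiple OpenMP threads.
        fprintf(stderr, "MPI_THREAD_MULTIPLE not supported (got %d)\n", provided);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    // ... MPI calls may now be issued from multiple threads ...
    MPI_Finalize();
    return 0;
}
```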
@@ -1067,12 +1130,18 @@ acDeviceRunMPITest(void)
 #endif /* MPIX_CUDA_AWARE_SUPPORT */
     //////// Borrowing end
 
+    int direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv ("MPICH_RDMA_ENABLED_CUDA"));
+    if(direct != 1){
+        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
+        exit (EXIT_FAILURE);
+    }
     // Create model and candidate meshes
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
 
     const int nn = 256;
-    info.int_params[AC_nx] = info.int_params[AC_ny] = info.int_params[AC_nz] = nn;
+    info.int_params[AC_nx] = info.int_params[AC_ny] = nn;
+    info.int_params[AC_nz] = 4*512;
     acUpdateConfig(&info);
 
     AcMesh model, candidate;
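The added guard only recognizes the MPICH/Cray `MPICH_RDMA_ENABLED_CUDA` environment variable, while the `MPIX_CUDA_AWARE_SUPPORT` block referenced just above it covers Open MPI builds. A combined runtime check might look like the sketch below; `cuda_aware_mpi_available` is a hypothetical helper, not part of this commit:

```c
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#if defined(OPEN_MPI)
#include <mpi-ext.h> // provides MPIX_CUDA_AWARE_SUPPORT / MPIX_Query_cuda_support
#endif

// Hypothetical helper combining the Open MPI query with the MPICH env check.
static int cuda_aware_mpi_available(void)
{
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    return MPIX_Query_cuda_support(); // Open MPI: ask the runtime directly
#else
    const char* env = getenv("MPICH_RDMA_ENABLED_CUDA");
    return env != NULL && atoi(env) == 1; // MPICH/Cray: rely on the env variable
#endif
}

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    printf("CUDA-aware MPI available: %d\n", cuda_aware_mpi_available());
    MPI_Finalize();
    return 0;
}
```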
@@ -1118,6 +1187,21 @@ acDeviceRunMPITest(void)
     acDeviceCreate(0, submesh_info, &device);
     acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
 
+
+    // Warmup
+    acDeviceSynchronizeStream(device, STREAM_ALL);
+    for (int i = 0; i < 10; ++i) {
+        ///// Communication start
+        {
+            const int3 start = (int3){0, 0, NGHOST};
+            const int3 end = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
+            acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
+        }
+        acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+        // Includes periodic bounds at first and last ghost zone
+        acDeviceCommunicateHalosMPI(device);
+    }
+
     ////////////////////////////// Timer start
     const int num_iters = 100;
     Timer total_time;
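The warmup loop added above runs the full boundary-condition plus halo-exchange step ten times before the timed region, so one-time costs (CUDA context and kernel setup, MPI connection establishment) do not pollute the measurement. The same pattern in a generic, self-contained form; `dummy_step` is a stand-in for the Astaroth calls, and `MPI_Wtime` replaces the project's `Timer`:

```c
#include <mpi.h>
#include <stdio.h>

// Generic warmup-then-measure skeleton.
static void benchmark(void (*step)(void), const int warmup_iters, const int iters)
{
    for (int i = 0; i < warmup_iters; ++i)
        step(); // absorbs one-time CUDA and MPI setup costs

    MPI_Barrier(MPI_COMM_WORLD); // start all ranks together
    const double t0 = MPI_Wtime();
    for (int i = 0; i < iters; ++i)
        step();
    MPI_Barrier(MPI_COMM_WORLD); // stop only after the slowest rank finishes
    const double ms = (MPI_Wtime() - t0) * 1e3;

    printf("Total time: %f ms, per step: %f ms\n", ms, ms / iters);
}

static void dummy_step(void) { /* stand-in for boundconds + halo exchange */ }

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    benchmark(dummy_step, 10, 100); // same warmup/iteration counts as the diff
    MPI_Finalize();
    return 0;
}
```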
@@ -1131,7 +1215,7 @@ acDeviceRunMPITest(void)
         }
 #if 1 // GPU-GPU if CUDA-aware MPI, otherwise managed CPU-GPU-GPU-CPU
         acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-        MPI_Barrier(MPI_COMM_WORLD);
+        //MPI_Barrier(MPI_COMM_WORLD);
         acDeviceCommunicateHalosMPI(
             device); // Includes periodic bounds at first and last ghost zone
         MPI_Barrier(MPI_COMM_WORLD);
@@ -1146,6 +1230,7 @@ acDeviceRunMPITest(void)
         const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
         printf("vertices: %d^3, iterations: %d\n", nn, num_iters);
         printf("Total time: %f ms\n", ms_elapsed);
+        printf("Time per step: %f ms\n", ms_elapsed / num_iters);
     }
     ////////////////////////////// Timer end
 
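One way to read the new per-step figure is against the message volume implied by `count` above: two z-faces of `AC_mx * AC_my * NGHOST` reals per field per step, for `NUM_VTXBUF_HANDLES` fields. The concrete numbers in the sketch below (ghost-zone width, field count, precision, and the measured time) are assumptions for illustration only and are not taken from this commit; substitute the real configuration values when interpreting the benchmark output, and note that the timed step also includes the local periodic boundary conditions.

```c
#include <stdio.h>

int main(void)
{
    const int    nghost     = 3;          // assumed ghost-zone width
    const int    mx         = 256 + 2 * nghost;
    const int    my         = 256 + 2 * nghost;
    const int    num_fields = 8;          // assumed NUM_VTXBUF_HANDLES
    const double real_bytes = 8.0;        // sizeof(AcReal) in double precision

    // Two z-faces (front + back) are exchanged per field per step.
    const double bytes_per_step = 2.0 * num_fields * mx * my * nghost * real_bytes;
    const double ms_per_step    = 1.0;    // hypothetical measured value

    printf("halo traffic per step: %.1f MiB\n", bytes_per_step / (1024.0 * 1024.0));
    printf("effective bandwidth:   %.1f GiB/s\n",
           bytes_per_step / (ms_per_step * 1e-3) / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```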