Rewrote partitioning code
This commit is contained in:
@@ -462,18 +462,31 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
|||||||
#if AC_MPI_ENABLED
|
#if AC_MPI_ENABLED
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
|
|
||||||
static int
|
|
||||||
mod(const int a, const int b)
|
|
||||||
{
|
|
||||||
const int r = a % b;
|
|
||||||
return r < 0 ? r + b : r;
|
|
||||||
}
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint64_t x, y, z;
|
uint64_t x, y, z;
|
||||||
} uint3_64;
|
} uint3_64;
|
||||||
|
|
||||||
|
static uint3_64
|
||||||
|
operator+(const uint3_64& a, const uint3_64& b)
|
||||||
|
{
|
||||||
|
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
|
||||||
|
}
|
||||||
|
|
||||||
|
static int3
|
||||||
|
make_int3(const uint3_64 a)
|
||||||
|
{
|
||||||
|
return (int3){(int)a.x, (int)a.y, (int)a.z};
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint64_t
|
||||||
|
mod(const int a, const int b)
|
||||||
|
{
|
||||||
|
const int r = a % b;
|
||||||
|
return r < 0 ? r + b : r;
|
||||||
|
}
|
||||||
|
|
||||||
static uint3_64
|
static uint3_64
|
||||||
morton3D(const uint64_t pid)
|
morton3D(const uint64_t pid)
|
||||||
{
|
{
|
||||||
@@ -502,118 +515,55 @@ morton1D(const uint3_64 pid)
|
|||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
static uint3_64
|
||||||
getPid(int3 pid, const int3 decomposition)
|
decompose(const uint64_t target)
|
||||||
{
|
{
|
||||||
/*
|
// This is just so beautifully elegant. Complex and efficient decomposition
|
||||||
return mod(pid.x, decomposition.x) + //
|
// in just one line of code.
|
||||||
mod(pid.y, decomposition.y) * decomposition.x + //
|
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
|
||||||
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
|
|
||||||
|
|
||||||
*/
|
ERRCHK_ALWAYS(p.x * p.y * p.z == target);
|
||||||
pid.x = mod(pid.x, decomposition.x);
|
return p;
|
||||||
pid.y = mod(pid.y, decomposition.y);
|
|
||||||
pid.z = mod(pid.z, decomposition.z);
|
|
||||||
|
|
||||||
uint64_t i = 0;
|
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
|
||||||
const uint64_t mask = 0x1l << bit;
|
|
||||||
i |= (((uint64_t)pid.x & mask) << 0) << 2 * bit;
|
|
||||||
i |= (((uint64_t)pid.y & mask) << 1) << 2 * bit;
|
|
||||||
i |= (((uint64_t)pid.z & mask) << 2) << 2 * bit;
|
|
||||||
}
|
|
||||||
return (int)i;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int3
|
static uint3_64
|
||||||
getPid3D(const int pid, const int3 decomposition)
|
wrap(const int3 i, const uint3_64 n)
|
||||||
{
|
{
|
||||||
/*
|
return (uint3_64){
|
||||||
const int3 pid3d = (int3){
|
mod(i.x, n.x),
|
||||||
mod(pid, decomposition.x),
|
mod(i.y, n.y),
|
||||||
mod(pid / decomposition.x, decomposition.y),
|
mod(i.z, n.z),
|
||||||
(pid / (decomposition.x * decomposition.y)),
|
};
|
||||||
};
|
|
||||||
|
|
||||||
ERRCHK_ALWAYS(getPid(pid3d, decomposition) == pid);
|
|
||||||
return pid3d;
|
|
||||||
*/
|
|
||||||
uint64_t i, j, k;
|
|
||||||
i = j = k = 0;
|
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
|
||||||
const uint64_t mask = 0x1l << 3 * bit;
|
|
||||||
i |= (((uint64_t)pid & (mask << 0)) >> 2 * bit) >> 0;
|
|
||||||
j |= (((uint64_t)pid & (mask << 1)) >> 2 * bit) >> 1;
|
|
||||||
k |= (((uint64_t)pid & (mask << 2)) >> 2 * bit) >> 2;
|
|
||||||
}
|
|
||||||
const int3 pid3d = (int3){i, j, k};
|
|
||||||
ERRCHK(getPid(pid3d, decomposition) == pid);
|
|
||||||
return pid3d;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Note: assumes that contiguous pids are on the same node and there is one process per GPU. I.e.
|
static int
|
||||||
* pids are linearly mapped i + j * dx + k * dx * dy. */
|
getPid(const int3 pid_raw, const uint3_64 decomp)
|
||||||
|
{
|
||||||
|
const uint3_64 pid = wrap(pid_raw, decomp);
|
||||||
|
return (int)morton1D(pid);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int3
|
||||||
|
getPid3D(const uint64_t pid, const uint3_64 decomp)
|
||||||
|
{
|
||||||
|
const uint3_64 pid3D = morton3D(pid);
|
||||||
|
ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid);
|
||||||
|
return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z};
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Assumes that contiguous pids are on the same node and there is one process per GPU. */
|
||||||
static inline bool
|
static inline bool
|
||||||
onTheSameNode(const int pid_a, const int pid_b)
|
onTheSameNode(const uint64_t pid_a, const uint64_t pid_b)
|
||||||
{
|
{
|
||||||
int devices_per_node = -1;
|
int devices_per_node = -1;
|
||||||
cudaGetDeviceCount(&devices_per_node);
|
cudaGetDeviceCount(&devices_per_node);
|
||||||
|
|
||||||
const int node_a = pid_a / devices_per_node;
|
const uint64_t node_a = pid_a / devices_per_node;
|
||||||
const int node_b = pid_b / devices_per_node;
|
const uint64_t node_b = pid_b / devices_per_node;
|
||||||
|
|
||||||
return node_a == node_b;
|
return node_a == node_b;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int3
|
|
||||||
decompose(const int target)
|
|
||||||
{
|
|
||||||
// This is just so beautifully elegant. Complex and efficient decomposition
|
|
||||||
// in just one line of code.
|
|
||||||
uint3_64 p = morton3D(target - 1);
|
|
||||||
p = (uint3_64){p.x + 1, p.y + 1, p.z + 1};
|
|
||||||
|
|
||||||
if (p.x * p.y * p.z != target) {
|
|
||||||
fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
|
|
||||||
fprintf(stderr, "Target nprocs: %d. Found: %d\n", target, p.x * p.y * p.z);
|
|
||||||
ERROR("Invalid nprocs");
|
|
||||||
return (int3){-1, -1, -1};
|
|
||||||
}
|
|
||||||
|
|
||||||
return (int3){p.x, p.y, p.z};
|
|
||||||
/*
|
|
||||||
if (target == 16)
|
|
||||||
return (int3){4, 2, 2};
|
|
||||||
if (target == 32)
|
|
||||||
return (int3){4, 4, 2};
|
|
||||||
if (target == 128)
|
|
||||||
return (int3){8, 4, 4};
|
|
||||||
if (target == 256)
|
|
||||||
return (int3){8, 8, 4};
|
|
||||||
|
|
||||||
int decomposition[] = {1, 1, 1};
|
|
||||||
|
|
||||||
int axis = 0;
|
|
||||||
while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
|
|
||||||
++decomposition[axis];
|
|
||||||
axis = (axis + 1) % 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int found = decomposition[0] * decomposition[1] * decomposition[2];
|
|
||||||
if (found != target) {
|
|
||||||
fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
|
|
||||||
fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
|
|
||||||
ERROR("Invalid nprocs");
|
|
||||||
return (int3){-1, -1, -1};
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return (int3){decomposition[0], decomposition[1], decomposition[2]};
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
static PackedData
|
static PackedData
|
||||||
acCreatePackedData(const int3 dims)
|
acCreatePackedData(const int3 dims)
|
||||||
{
|
{
|
||||||
@@ -746,7 +696,7 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd
|
|||||||
|
|
||||||
// TODO: do with packed data
|
// TODO: do with packed data
|
||||||
static AcResult
|
static AcResult
|
||||||
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
|
acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
|
||||||
{
|
{
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
printf("Distributing mesh...\n");
|
printf("Distributing mesh...\n");
|
||||||
@@ -822,7 +772,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
|
|||||||
|
|
||||||
// TODO: do with packed data
|
// TODO: do with packed data
|
||||||
static AcResult
|
static AcResult
|
||||||
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
|
acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
|
||||||
{
|
{
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
printf("Gathering mesh...\n");
|
printf("Gathering mesh...\n");
|
||||||
@@ -1038,7 +988,7 @@ acTransferCommDataToDevice(const Device device, CommData* data)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if AC_MPI_RT_PINNING
|
#if AC_MPI_RT_PINNING
|
||||||
static void
|
static inline void
|
||||||
acPinCommData(const Device device, CommData* data)
|
acPinCommData(const Device device, CommData* data)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
@@ -1077,7 +1027,7 @@ acTransferCommData(const Device device, //
|
|||||||
int nprocs, pid;
|
int nprocs, pid;
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
const int3 decomp = decompose(nprocs);
|
const uint3_64 decomp = decompose(nprocs);
|
||||||
|
|
||||||
const int3 nn = (int3){
|
const int3 nn = (int3){
|
||||||
device->local_config.int_params[AC_nx],
|
device->local_config.int_params[AC_nx],
|
||||||
@@ -1230,7 +1180,7 @@ acTransferCommData(const Device device, //
|
|||||||
int nprocs, pid;
|
int nprocs, pid;
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
const int3 decomp = decompose(nprocs);
|
const uint3_64 decomp = decompose(nprocs);
|
||||||
|
|
||||||
const int3 nn = (int3){
|
const int3 nn = (int3){
|
||||||
device->local_config.int_params[AC_nx],
|
device->local_config.int_params[AC_nx],
|
||||||
@@ -1363,7 +1313,7 @@ acTransferCommData(const Device device, //
|
|||||||
int nprocs, pid;
|
int nprocs, pid;
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
const int3 decomp = decompose(nprocs);
|
const uint3_64 decomp = decompose(nprocs);
|
||||||
|
|
||||||
const int3 nn = (int3){
|
const int3 nn = (int3){
|
||||||
device->local_config.int_params[AC_nx],
|
device->local_config.int_params[AC_nx],
|
||||||
@@ -1467,7 +1417,7 @@ acTransferCommDataWait(const CommData data)
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
Device device;
|
Device device;
|
||||||
AcMesh submesh;
|
AcMesh submesh;
|
||||||
int3 decomposition;
|
uint3_64 decomposition;
|
||||||
bool initialized;
|
bool initialized;
|
||||||
|
|
||||||
int3 nn;
|
int3 nn;
|
||||||
@@ -1508,11 +1458,11 @@ acGridInit(const AcMeshInfo info)
|
|||||||
printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
|
printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
|
||||||
|
|
||||||
// Decompose
|
// Decompose
|
||||||
AcMeshInfo submesh_info = info;
|
AcMeshInfo submesh_info = info;
|
||||||
const int3 decomposition = decompose(nprocs);
|
const uint3_64 decomposition = decompose(nprocs);
|
||||||
const int3 pid3d = getPid3D(pid, decomposition);
|
const int3 pid3d = getPid3D(pid, decomposition);
|
||||||
|
|
||||||
printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
|
printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z);
|
||||||
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
|
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
|
||||||
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
|
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
|
||||||
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
|
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
|
||||||
@@ -1650,7 +1600,7 @@ acGridQuit(void)
|
|||||||
acDestroyCommData(grid.device, &grid.sideyz_data);
|
acDestroyCommData(grid.device, &grid.sideyz_data);
|
||||||
|
|
||||||
grid.initialized = false;
|
grid.initialized = false;
|
||||||
grid.decomposition = (int3){-1, -1, -1};
|
grid.decomposition = (uint3_64){0, 0, 0};
|
||||||
acMeshDestroy(&grid.submesh);
|
acMeshDestroy(&grid.submesh);
|
||||||
acDeviceDestroy(grid.device);
|
acDeviceDestroy(grid.device);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user