3D decomposition is now done using Morton order instead of linear indexing
This commit is contained in:
@@ -468,29 +468,62 @@ mod(const int a, const int b)
|
|||||||
const int r = a % b;
|
const int r = a % b;
|
||||||
return r < 0 ? r + b : r;
|
return r < 0 ? r + b : r;
|
||||||
}
|
}
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
static int
|
int
|
||||||
getPid(const int3 pid, const int3 decomposition)
|
getPid(int3 pid, const int3 decomposition)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
return mod(pid.x, decomposition.x) + //
|
return mod(pid.x, decomposition.x) + //
|
||||||
mod(pid.y, decomposition.y) * decomposition.x + //
|
mod(pid.y, decomposition.y) * decomposition.x + //
|
||||||
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
|
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
|
||||||
|
|
||||||
|
*/
|
||||||
|
pid.x = mod(pid.x, decomposition.x);
|
||||||
|
pid.y = mod(pid.y, decomposition.y);
|
||||||
|
pid.z = mod(pid.z, decomposition.z);
|
||||||
|
|
||||||
|
uint64_t i = 0;
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << bit;
|
||||||
|
i |= (((uint64_t)pid.x & mask) << 0) << 2 * bit;
|
||||||
|
i |= (((uint64_t)pid.y & mask) << 1) << 2 * bit;
|
||||||
|
i |= (((uint64_t)pid.z & mask) << 2) << 2 * bit;
|
||||||
|
}
|
||||||
|
return (int)i;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int3
|
int3
|
||||||
getPid3D(const int pid, const int3 decomposition)
|
getPid3D(const int pid, const int3 decomposition)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
const int3 pid3d = (int3){
|
const int3 pid3d = (int3){
|
||||||
mod(pid, decomposition.x),
|
mod(pid, decomposition.x),
|
||||||
mod(pid / decomposition.x, decomposition.y),
|
mod(pid / decomposition.x, decomposition.y),
|
||||||
(pid / (decomposition.x * decomposition.y)),
|
(pid / (decomposition.x * decomposition.y)),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ERRCHK_ALWAYS(getPid(pid3d, decomposition) == pid);
|
||||||
return pid3d;
|
return pid3d;
|
||||||
|
*/
|
||||||
|
uint64_t i, j, k;
|
||||||
|
i = j = k = 0;
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << 3 * bit;
|
||||||
|
i |= (((uint64_t)pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||||
|
j |= (((uint64_t)pid & (mask << 1)) >> 2 * bit) >> 1;
|
||||||
|
k |= (((uint64_t)pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||||
|
}
|
||||||
|
const int3 pid3d = (int3){i, j, k};
|
||||||
|
ERRCHK(getPid(pid3d, decomposition) == pid);
|
||||||
|
return pid3d;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Note: assumes that contiguous pids are on the same node and there is one process per GPU. I.e.
|
/** Note: assumes that contiguous pids are on the same node and there is one process per GPU. I.e.
|
||||||
* pids are linearly mapped i + j * dx + k * dx * dy. */
|
* pids are linearly mapped i + j * dx + k * dx * dy. */
|
||||||
static bool
|
static inline bool
|
||||||
onTheSameNode(const int pid_a, const int pid_b)
|
onTheSameNode(const int pid_a, const int pid_b)
|
||||||
{
|
{
|
||||||
int devices_per_node = -1;
|
int devices_per_node = -1;
|
||||||
|
Reference in New Issue
Block a user