#include "astaroth.h" #include #include "errchk.h" #include "math_utils.h" #include "timer_hires.h" #include "kernels/kernels.h" #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) #define MPI_GPUDIRECT_DISABLED (0) // Buffer through host memory, deprecated #define MPI_DECOMPOSITION_AXES (3) #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) #define MPI_INCL_CORNERS (1) #define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost #include // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set) AcResult acDevicePrintInfo(const Device device) { const int device_id = device->id; cudaDeviceProp props; cudaGetDeviceProperties(&props, device_id); printf("--------------------------------------------------\n"); printf("Device Number: %d\n", device_id); const size_t bus_id_max_len = 128; char bus_id[bus_id_max_len]; cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id); printf(" PCI bus ID: %s\n", bus_id); printf(" Device name: %s\n", props.name); printf(" Compute capability: %d.%d\n", props.major, props.minor); // Compute printf(" Compute\n"); printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz printf(" Stream processors: %d\n", props.multiProcessorCount); printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); printf( " Compute mode: %d\n", (int)props .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 // Memory printf(" Global memory\n"); printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); printf(" Peak Memory Bandwidth (GiB/s): %f\n", 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.)); printf(" ECC enabled: %d\n", props.ECCEnabled); // Memory usage size_t free_bytes, total_bytes; cudaMemGetInfo(&free_bytes, &total_bytes); const size_t used_bytes = total_bytes - free_bytes; printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); printf(" Caches\n"); printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error // MV: while compiling in TIARA gp cluster. Therefore commeted out. //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); printf(" Other\n"); printf(" Warp size: %d\n", props.warpSize); // printf(" Single to double perf. 
ratio: %dx\n", // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA // versions printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); printf("--------------------------------------------------\n"); return AC_SUCCESS; } AcResult acDeviceAutoOptimize(const Device device) { cudaSetDevice(device->id); const int3 start = (int3){ device->local_config.int_params[AC_nx_min], device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_nz_min], }; const int3 end = (int3){ device->local_config.int_params[AC_nx_max], device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_nz_max], }; return acKernelAutoOptimizeIntegration(start, end, device->vba); } AcResult acDeviceSynchronizeStream(const Device device, const Stream stream) { cudaSetDevice(device->id); if (stream == STREAM_ALL) { cudaDeviceSynchronize(); } else { cudaStreamSynchronize(device->streams[stream]); } return AC_SUCCESS; } AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) { cudaSetDevice(id); // cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate // multiple devices with a single GPU // Create Device struct device_s* device = (struct device_s*)malloc(sizeof(*device)); ERRCHK_ALWAYS(device); device->id = id; device->local_config = device_config; #if AC_VERBOSE acDevicePrintInfo(device); #endif // Check that the code was compiled for the proper GPU architecture #if AC_VERBOSE printf("Trying to run a dummy kernel. If this fails, make sure that your\n" "device supports the CUDA architecture you are compiling for.\n"); #endif printf("Testing CUDA... "); fflush(stdout); acKernelDummy(); printf("\x1B[32m%s\x1B[0m\n", "OK!"); fflush(stdout); // Concurrency for (int i = 0; i < NUM_STREAMS; ++i) { cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, i); } // Memory // VBA in/out const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.in[i], vba_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.out[i], vba_size_bytes)); } // VBA Profiles const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx], max(device_config.int_params[AC_my], device_config.int_params[AC_mz])); for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) { ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.profiles[i], profile_size_bytes)); } // Reductions ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal))); // Device constants acDeviceLoadDefaultUniforms(device); acDeviceLoadMeshInfo(device, device_config); #if AC_VERBOSE printf("Created device %d (%p)\n", device->id, device); #endif *device_handle = device; // Autoptimize acDeviceAutoOptimize(device); return AC_SUCCESS; } AcResult acDeviceDestroy(Device device) { cudaSetDevice(device->id); #if AC_VERBOSE printf("Destroying device %d (%p)\n", device->id, device); #endif acDeviceSynchronizeStream(device, STREAM_ALL); // Memory for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { cudaFree(device->vba.in[i]); cudaFree(device->vba.out[i]); } for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) { cudaFree(device->vba.profiles[i]); } cudaFree(device->reduce_scratchpad); cudaFree(device->reduce_result); // Concurrency for (int i = 0; i < NUM_STREAMS; ++i) { 
cudaStreamDestroy(device->streams[i]); } // Destroy Device free(device); return AC_SUCCESS; } AcResult acDeviceSwapBuffers(const Device device) { cudaSetDevice(device->id); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { AcReal* tmp = device->vba.in[i]; device->vba.in[i] = device->vba.out[i]; device->vba.out[i] = tmp; } return AC_SUCCESS; } AcResult acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle, const size_t start, const AcReal* data, const size_t num) { cudaSetDevice(device->id); if (handle >= NUM_SCALARARRAY_HANDLES || !NUM_SCALARARRAY_HANDLES) return AC_FAILURE; ERRCHK((int)(start + num) <= max(device->local_config.int_params[AC_mx], max(device->local_config.int_params[AC_my], device->local_config.int_params[AC_mz]))); ERRCHK_ALWAYS(handle < NUM_SCALARARRAY_HANDLES); ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num, cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices) { cudaSetDevice(device->id); const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx]; AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx]; const size_t bytes = num_vertices * sizeof(src_ptr[0]); ERRCHK_CUDA( // cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) // ); return AC_SUCCESS; } AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, const int3 src, const int3 dst, const int num_vertices) { WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src, dst, num_vertices); } return AC_SUCCESS; } AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle) { const int3 src = (int3){0, 0, 0}; const int3 dst = src; const size_t num_vertices = acVertexBufferSize(device->local_config); acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst, num_vertices); return AC_SUCCESS; } AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i); } return AC_SUCCESS; } AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { cudaSetDevice(device->id); const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx]; AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]; const size_t bytes = num_vertices * sizeof(src_ptr[0]); ERRCHK_CUDA( // cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) // ); return AC_SUCCESS; } AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, const int3 
dst, const int num_vertices, AcMesh* host_mesh) { WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst, num_vertices, host_mesh); } return AC_SUCCESS; } AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) { int3 src = (int3){0, 0, 0}; int3 dst = src; const size_t num_vertices = acVertexBufferSize(device->local_config); acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices, host_mesh); return AC_SUCCESS; } AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh); } return AC_SUCCESS; } AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, Device dst_device) { cudaSetDevice(src_device->id); const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx]; AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx]; const size_t bytes = num_vertices * sizeof(src_ptr[0]); ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes, src_device->streams[stream])); return AC_SUCCESS; } AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src, const int3 dst, const int num_vertices, Device dst_device) { WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst, num_vertices, dst_device); } return AC_SUCCESS; } AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, const VertexBufferHandle vtxbuf_handle, Device dst_device) { int3 src = (int3){0, 0, 0}; int3 dst = src; const size_t num_vertices = acVertexBufferSize(src_device->local_config); acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); return AC_SUCCESS; } AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device) { WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device); } return AC_SUCCESS; } AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, const int3 start, const int3 end, const AcReal dt) { cudaSetDevice(device->id); acDeviceLoadScalarUniform(device, stream, AC_dt, dt); return acKernelIntegrateSubstep(device->streams[stream], step_number, start, end, device->vba); } AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 start, const int3 end) { cudaSetDevice(device->id); return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]); } AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, const int3 end) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDevicePeriodicBoundcondStep(device, stream, 
(VertexBufferHandle)i, start, end); } return AC_SUCCESS; } AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 start, const int3 end, const int bound_direction) { cudaSetDevice(device->id); return acKernelGeneralBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle], bound_direction); } AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, const int3 end, const int bound_direction) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, bound_direction); } return AC_SUCCESS; } AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) { cudaSetDevice(device->id); const int3 start = (int3){device->local_config.int_params[AC_nx_min], device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_nz_min]}; const int3 end = (int3){device->local_config.int_params[AC_nx_max], device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_nz_max]}; *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle], device->reduce_scratchpad, device->reduce_result); return AC_SUCCESS; } AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result) { cudaSetDevice(device->id); const int3 start = (int3){device->local_config.int_params[AC_nx_min], device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_nz_min]}; const int3 end = (int3){device->local_config.int_params[AC_nx_max], device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_nz_max]}; *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0], device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], device->reduce_scratchpad, device->reduce_result); return AC_SUCCESS; } AcResult acDeviceReduceVecScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, const VertexBufferHandle vtxbuf3, AcReal* result) { cudaSetDevice(device->id); const int3 start = (int3){device->local_config.int_params[AC_nx_min], device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_nz_min]}; const int3 end = (int3){device->local_config.int_params[AC_nx_max], device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_nz_max]}; *result = acKernelReduceVecScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0], device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], device->vba.in[vtxbuf3], device->reduce_scratchpad, device->reduce_result); return AC_SUCCESS; } #if AC_MPI_ENABLED /** Quick overview of the MPI implementation: The halo is partitioned into segments. The first coordinate of a segment is b0. The array containing multiple b0s is called... "b0s". Each b0 maps to an index in the computational domain of some neighboring process a0. We have a0 = mod(b0 - nghost, nn) + nghost. Intuitively, we 1) Transform b0 into a coordinate system where (0, 0, 0) is the first index in the comp domain. 
       2) Wrap the transformed b0 around nn (comp domain)
       3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index in
          the ghost zone

    struct PackedData is used for packing and unpacking. Holds the actual data in the halo
    partition.

    struct CommData holds multiple PackedDatas for sending and receiving halo partitions.

    struct Grid contains information about the local GPU device, decomposition, the total mesh
    dimensions and CommDatas.

    Basic steps:
        1) Distribute the mesh among ranks
        2) Integrate & communicate
            - start inner integration and at the same time, pack halo data and send it to
              neighbors
            - once all halo data has been received, unpack and do outer integration
            - sync and start again
        3) Gather the mesh to rank 0 for postprocessing
*/
#include <mpi.h>
#include

typedef struct {
    uint64_t x, y, z;
} uint3_64;

static uint3_64 operator+(const uint3_64& a, const uint3_64& b)
{
    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}

static int3 make_int3(const uint3_64 a)
{
    return (int3){(int)a.x, (int)a.y, (int)a.z};
}

static uint64_t mod(const int a, const int b)
{
    const int r = a % b;
    return r < 0 ? r + b : r;
}

static uint3_64 morton3D(const uint64_t pid)
{
    uint64_t i, j, k;
    i = j = k = 0;

    if (MPI_DECOMPOSITION_AXES == 3) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 3 * bit;
            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
        }
    }
    // Just a quick copy/paste for other decomp dims
    else if (MPI_DECOMPOSITION_AXES == 2) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 2 * bit;
            j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
            k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
        }
    }
    else if (MPI_DECOMPOSITION_AXES == 1) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 1 * bit;
            k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
        }
    }
    else {
        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
        ERRCHK_ALWAYS(0);
    }

    return (uint3_64){i, j, k};
}

static uint64_t morton1D(const uint3_64 pid)
{
    uint64_t i = 0;

    if (MPI_DECOMPOSITION_AXES == 3) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.z & mask) << 0) << 2 * bit;
            i |= ((pid.y & mask) << 1) << 2 * bit;
            i |= ((pid.x & mask) << 2) << 2 * bit;
        }
    }
    else if (MPI_DECOMPOSITION_AXES == 2) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.y & mask) << 0) << 1 * bit;
            i |= ((pid.z & mask) << 1) << 1 * bit;
        }
    }
    else if (MPI_DECOMPOSITION_AXES == 1) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.z & mask) << 0) << 0 * bit;
        }
    }
    else {
        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
        ERRCHK_ALWAYS(0);
    }

    return i;
}

static uint3_64 decompose(const uint64_t target)
{
    // This is just so beautifully elegant. Complex and efficient decomposition
    // in just one line of code.
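    //
    // For illustration (worked out from morton3D above, not from a run): with 2
    // processes this yields a (1, 1, 2) decomposition, with 4 processes (1, 2, 2),
    // and with 8 processes (2, 2, 2). Process counts whose Morton decomposition does
    // not multiply back to `target` (e.g. 6) are rejected by the ERRCHK_ALWAYS below.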
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1}; ERRCHK_ALWAYS(p.x * p.y * p.z == target); return p; } static uint3_64 wrap(const int3 i, const uint3_64 n) { return (uint3_64){ mod(i.x, n.x), mod(i.y, n.y), mod(i.z, n.z), }; } static int getPid(const int3 pid_raw, const uint3_64 decomp) { const uint3_64 pid = wrap(pid_raw, decomp); return (int)morton1D(pid); } static int3 getPid3D(const uint64_t pid, const uint3_64 decomp) { const uint3_64 pid3D = morton3D(pid); ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid); return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z}; } /** Assumes that contiguous pids are on the same node and there is one process per GPU. */ static inline bool onTheSameNode(const uint64_t pid_a, const uint64_t pid_b) { int devices_per_node = -1; cudaGetDeviceCount(&devices_per_node); const uint64_t node_a = pid_a / devices_per_node; const uint64_t node_b = pid_b / devices_per_node; return node_a == node_b; } static PackedData acCreatePackedData(const int3 dims) { PackedData data = {}; data.dims = dims; const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); #if MPI_USE_CUDA_DRIVER_PINNING ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); unsigned int flag = 1; CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned); ERRCHK_ALWAYS(retval == CUDA_SUCCESS); #else ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly // slower than pinned (38 ms vs. 125 ms) #endif // USE_CUDA_DRIVER_PINNING return data; } static AcResult acDestroyPackedData(PackedData* data) { cudaFree(data->data_pinned); data->dims = (int3){-1, -1, -1}; cudaFree(data->data); data->data = NULL; return AC_SUCCESS; } #if MPI_GPUDIRECT_DISABLED static PackedData acCreatePackedDataHost(const int3 dims) { PackedData data = {}; data.dims = dims; const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; data.data = (AcReal*)malloc(bytes); ERRCHK_ALWAYS(data.data); return data; } static AcResult acDestroyPackedDataHost(PackedData* data) { data->dims = (int3){-1, -1, -1}; free(data->data); data->data = NULL; return AC_SUCCESS; } static void acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata, PackedData* hdata) { cudaSetDevice(device->id); const size_t bytes = ddata.dims.x * ddata.dims.y * ddata.dims.z * sizeof(ddata.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA(cudaMemcpyAsync(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost, stream)); } static void acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, const PackedData hdata, PackedData* ddata) { cudaSetDevice(device->id); const size_t bytes = hdata.dims.x * hdata.dims.y * hdata.dims.z * sizeof(hdata.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream)); } #endif // MPI_GPUDIRECT_DISABLED static void acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata) { cudaSetDevice(device->id); // TODO sync stream ddata->pinned = true; const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA(cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream)); } static void acUnpinPackedData(const Device 
device, const cudaStream_t stream, PackedData* ddata) { if (!ddata->pinned) // Unpin iff the data was pinned previously return; cudaSetDevice(device->id); // TODO sync stream ddata->pinned = false; const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream)); } // TODO: do with packed data static AcResult acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst) { MPI_Barrier(MPI_COMM_WORLD); #if AC_VERBOSE printf("Distributing mesh...\n"); fflush(stdout); #endif MPI_Datatype datatype = MPI_FLOAT; if (sizeof(AcReal) == 8) datatype = MPI_DOUBLE; int pid, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); ERRCHK_ALWAYS(dst); // Submesh nn const int3 nn = (int3){ dst->info.int_params[AC_nx], dst->info.int_params[AC_ny], dst->info.int_params[AC_nz], }; // Send to self if (pid == 0) { for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { // For pencils for (int k = NGHOST; k < NGHOST + nn.z; ++k) { for (int j = NGHOST; j < NGHOST + nn.y; ++j) { const int i = NGHOST; const int count = nn.x; const int src_idx = acVertexBufferIdx(i, j, k, src.info); const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], // &src.vertex_buffer[vtxbuf][src_idx], // count * sizeof(src.vertex_buffer[i][0])); } } } } for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { // For pencils for (int k = NGHOST; k < NGHOST + nn.z; ++k) { for (int j = NGHOST; j < NGHOST + nn.y; ++j) { const int i = NGHOST; const int count = nn.x; if (pid != 0) { const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); // Recv MPI_Status status; MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0, MPI_COMM_WORLD, &status); } else { for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) { const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition); const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, // j + tgt_pid3d.y * nn.y, // k + tgt_pid3d.z * nn.z, // src.info); // Send MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0, MPI_COMM_WORLD); } } } } } return AC_SUCCESS; } // TODO: do with packed data static AcResult acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst) { MPI_Barrier(MPI_COMM_WORLD); #if AC_VERBOSE printf("Gathering mesh...\n"); fflush(stdout); #endif MPI_Datatype datatype = MPI_FLOAT; if (sizeof(AcReal) == 8) datatype = MPI_DOUBLE; int pid, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (pid == 0) ERRCHK_ALWAYS(dst); // Submesh nn const int3 nn = (int3){ src.info.int_params[AC_nx], src.info.int_params[AC_ny], src.info.int_params[AC_nz], }; // Submesh mm const int3 mm = (int3){ src.info.int_params[AC_mx], src.info.int_params[AC_my], src.info.int_params[AC_mz], }; // Send to self if (pid == 0) { for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { // For pencils for (int k = 0; k < mm.z; ++k) { for (int j = 0; j < mm.y; ++j) { const int i = 0; const int count = mm.x; const int src_idx = acVertexBufferIdx(i, j, k, src.info); const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], // &src.vertex_buffer[vtxbuf][src_idx], // count * sizeof(src.vertex_buffer[i][0])); } } } } for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { // For pencils for (int k = 0; k < mm.z; ++k) { for (int j = 
0; j < mm.y; ++j) { const int i = 0; const int count = mm.x; if (pid != 0) { // Send const int src_idx = acVertexBufferIdx(i, j, k, src.info); MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0, MPI_COMM_WORLD); } else { for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) { const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition); const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, // j + tgt_pid3d.y * nn.y, // k + tgt_pid3d.z * nn.z, // dst->info); // Recv MPI_Status status; MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0, MPI_COMM_WORLD, &status); } } } } } return AC_SUCCESS; } typedef struct { PackedData* srcs; PackedData* dsts; #if MPI_GPUDIRECT_DISABLED PackedData* srcs_host; PackedData* dsts_host; #endif int3 dims; size_t count; cudaStream_t* streams; MPI_Request* send_reqs; MPI_Request* recv_reqs; } CommData; static CommData acCreateCommData(const Device device, const int3 dims, const size_t count) { cudaSetDevice(device->id); CommData data = {}; data.srcs = (PackedData*)malloc(count * sizeof(PackedData)); data.dsts = (PackedData*)malloc(count * sizeof(PackedData)); data.dims = dims; data.count = count; data.streams = (cudaStream_t*)malloc(count * sizeof(cudaStream_t)); data.send_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request)); data.recv_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request)); ERRCHK_ALWAYS(data.srcs); ERRCHK_ALWAYS(data.dsts); ERRCHK_ALWAYS(data.send_reqs); ERRCHK_ALWAYS(data.recv_reqs); #if MPI_GPUDIRECT_DISABLED data.srcs_host = (PackedData*)malloc(count * sizeof(PackedData)); data.dsts_host = (PackedData*)malloc(count * sizeof(PackedData)); ERRCHK_ALWAYS(data.srcs_host); ERRCHK_ALWAYS(data.dsts_host); #endif for (size_t i = 0; i < count; ++i) { data.srcs[i] = acCreatePackedData(dims); data.dsts[i] = acCreatePackedData(dims); #if MPI_GPUDIRECT_DISABLED data.srcs_host[i] = acCreatePackedDataHost(dims); data.dsts_host[i] = acCreatePackedDataHost(dims); #endif int low_prio, high_prio; cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio); cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio); } return data; } static void acDestroyCommData(const Device device, CommData* data) { cudaSetDevice(device->id); for (size_t i = 0; i < data->count; ++i) { acDestroyPackedData(&data->srcs[i]); acDestroyPackedData(&data->dsts[i]); #if MPI_GPUDIRECT_DISABLED acDestroyPackedDataHost(&data->srcs_host[i]); acDestroyPackedDataHost(&data->dsts_host[i]); #endif cudaStreamDestroy(data->streams[i]); } free(data->srcs); free(data->dsts); #if MPI_GPUDIRECT_DISABLED free(data->srcs_host); free(data->dsts_host); #endif free(data->streams); free(data->send_reqs); free(data->recv_reqs); data->count = -1; data->dims = (int3){-1, -1, -1}; } static void acSyncCommData(const CommData data) { for (size_t i = 0; i < data.count; ++i) cudaStreamSynchronize(data.streams[i]); } static int3 mod(const int3 a, const int3 n) { return (int3){(int)mod(a.x, n.x), (int)mod(a.y, n.y), (int)mod(a.z, n.z)}; } static void acPackCommData(const Device device, const int3* b0s, CommData* data) { cudaSetDevice(device->id); const int3 nn = (int3){ device->local_config.int_params[AC_nx], device->local_config.int_params[AC_ny], device->local_config.int_params[AC_nz], }; const int3 nghost = (int3){NGHOST, NGHOST, NGHOST}; for (size_t i = 0; i < data->count; ++i) { const int3 a0 = mod(b0s[i] - nghost, nn) + nghost; acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]); } } static void acUnpackCommData(const Device device, 
const int3* b0s, CommData* data) { cudaSetDevice(device->id); for (size_t i = 0; i < data->count; ++i) acKernelUnpackData(data->streams[i], data->dsts[i], b0s[i], device->vba); } #if MPI_GPUDIRECT_DISABLED static void acTransferCommDataToHost(const Device device, CommData* data) { cudaSetDevice(device->id); for (size_t i = 0; i < data->count; ++i) acTransferPackedDataToHost(device, data->streams[i], data->srcs[i], &data->srcs_host[i]); } static void acTransferCommDataToDevice(const Device device, CommData* data) { cudaSetDevice(device->id); for (size_t i = 0; i < data->count; ++i) acTransferPackedDataToDevice(device, data->streams[i], data->dsts_host[i], &data->dsts[i]); } #endif static inline void acPinCommData(const Device device, CommData* data) { cudaSetDevice(device->id); for (size_t i = 0; i < data->count; ++i) acPinPackedData(device, data->streams[i], &data->srcs[i]); } static void acUnpinCommData(const Device device, CommData* data) { cudaSetDevice(device->id); // Clear pin flags from src for (size_t i = 0; i < data->count; ++i) data->srcs[i].pinned = false; // Transfer from pinned to gmem for (size_t i = 0; i < data->count; ++i) acUnpinPackedData(device, data->streams[i], &data->dsts[i]); } static AcResult acTransferCommData(const Device device, // const int3* b0s, // Halo partition coordinates CommData* data) { cudaSetDevice(device->id); MPI_Datatype datatype = MPI_FLOAT; if (sizeof(AcReal) == 8) datatype = MPI_DOUBLE; int nprocs, pid; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &pid); const uint3_64 decomp = decompose(nprocs); const int3 nn = (int3){ device->local_config.int_params[AC_nx], device->local_config.int_params[AC_ny], device->local_config.int_params[AC_nz], }; const int3 pid3d = getPid3D(pid, decomp); const int3 dims = data->dims; const size_t blockcount = data->count; const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) { const int3 b0 = b0s[b0_idx]; const int3 neighbor = (int3){ b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0, b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0, b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0, }; const int npid = getPid(pid3d + neighbor, decomp); PackedData* dst = &data->dsts[b0_idx]; if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) { MPI_Irecv(dst->data, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->recv_reqs[b0_idx]); dst->pinned = false; } else { MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->recv_reqs[b0_idx]); dst->pinned = true; } } for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) { const int3 b0 = b0s[b0_idx]; const int3 neighbor = (int3){ b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0, b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0, b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 
1 : 0, }; const int npid = getPid(pid3d - neighbor, decomp); PackedData* src = &data->srcs[b0_idx]; if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) { cudaStreamSynchronize(data->streams[b0_idx]); MPI_Isend(src->data, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->send_reqs[b0_idx]); } else { acPinPackedData(device, data->streams[b0_idx], src); cudaStreamSynchronize(data->streams[b0_idx]); MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->send_reqs[b0_idx]); } } return AC_SUCCESS; } static void acTransferCommDataWait(const CommData data) { MPI_Waitall(data.count, data.recv_reqs, MPI_STATUSES_IGNORE); MPI_Waitall(data.count, data.send_reqs, MPI_STATUSES_IGNORE); } typedef struct { Device device; AcMesh submesh; uint3_64 decomposition; bool initialized; int3 nn; CommData corner_data; CommData edgex_data; CommData edgey_data; CommData edgez_data; CommData sidexy_data; CommData sidexz_data; CommData sideyz_data; // int comm_cart; } Grid; static Grid grid = {}; AcResult acGridSynchronizeStream(const Stream stream) { ERRCHK(grid.initialized); acDeviceSynchronizeStream(grid.device, stream); MPI_Barrier(MPI_COMM_WORLD); return AC_SUCCESS; } AcResult acGridInit(const AcMeshInfo info) { ERRCHK(!grid.initialized); // Check that MPI is initialized int nprocs, pid; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &pid); char processor_name[MPI_MAX_PROCESSOR_NAME]; int name_len; MPI_Get_processor_name(processor_name, &name_len); // Decompose AcMeshInfo submesh_info = info; const uint3_64 decomposition = decompose(nprocs); const int3 pid3d = getPid3D(pid, decomposition); MPI_Barrier(MPI_COMM_WORLD); printf("Processor %s. Process %d of %d: (%d, %d, %d)\n", processor_name, pid, nprocs, pid3d.x, pid3d.y, pid3d.z); printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0); ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0); ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0); const int submesh_nx = info.int_params[AC_nx] / decomposition.x; const int submesh_ny = info.int_params[AC_ny] / decomposition.y; const int submesh_nz = info.int_params[AC_nz] / decomposition.z; submesh_info.int_params[AC_nx] = submesh_nx; submesh_info.int_params[AC_ny] = submesh_ny; submesh_info.int_params[AC_nz] = submesh_nz; submesh_info.int3_params[AC_global_grid_n] = (int3){ info.int_params[AC_nx], info.int_params[AC_ny], info.int_params[AC_nz], }; submesh_info.int3_params[AC_multigpu_offset] = pid3d * (int3){submesh_nx, submesh_ny, submesh_nz}; acUpdateBuiltinParams(&submesh_info); // GPU alloc int devices_per_node = -1; cudaGetDeviceCount(&devices_per_node); Device device; acDeviceCreate(pid % devices_per_node, submesh_info, &device); // CPU alloc AcMesh submesh; acMeshCreate(submesh_info, &submesh); // Setup the global grid structure grid.device = device; grid.submesh = submesh; grid.decomposition = decomposition; grid.initialized = true; // Configure const int3 nn = (int3){ device->local_config.int_params[AC_nx], device->local_config.int_params[AC_ny], device->local_config.int_params[AC_nz], }; // Create CommData // We have 8 corners, 12 edges, and 6 sides // // For simplicity's sake all data blocks inside a single CommData struct // have the same dimensions. 
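    //
    // Illustration (values assumed, not taken from a real configuration): with a
    // 128^3 submesh (nn = 128 in each direction) and NGHOST = 3, the corner blocks
    // below are 3 x 3 x 3 (8 of them), the x-edge blocks 128 x 3 x 3 (4 per axis,
    // 12 edges in total), and the xy-side blocks 128 x 128 x 3 (2 per plane,
    // 6 sides in total).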
grid.nn = nn; grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8); grid.edgex_data = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4); grid.edgey_data = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4); grid.edgez_data = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4); grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2); grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2); grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2); acGridSynchronizeStream(STREAM_ALL); return AC_SUCCESS; } AcResult acGridQuit(void) { ERRCHK(grid.initialized); acGridSynchronizeStream(STREAM_ALL); acDestroyCommData(grid.device, &grid.corner_data); acDestroyCommData(grid.device, &grid.edgex_data); acDestroyCommData(grid.device, &grid.edgey_data); acDestroyCommData(grid.device, &grid.edgez_data); acDestroyCommData(grid.device, &grid.sidexy_data); acDestroyCommData(grid.device, &grid.sidexz_data); acDestroyCommData(grid.device, &grid.sideyz_data); grid.initialized = false; grid.decomposition = (uint3_64){0, 0, 0}; acMeshDestroy(&grid.submesh); acDeviceDestroy(grid.device); acGridSynchronizeStream(STREAM_ALL); return AC_SUCCESS; } AcResult acGridLoadScalarUniform(const Stream stream, const AcRealParam param, const AcReal value) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); #if AC_DOUBLE_PRECISION == 1 MPI_Datatype datatype = MPI_DOUBLE; #else MPI_Datatype datatype = MPI_FLOAT; #endif const int root_proc = 0; AcReal buffer = value; MPI_Bcast(&buffer, 1, datatype, root_proc, MPI_COMM_WORLD); acDeviceLoadScalarUniform(grid.device, stream, param, buffer); return AC_SUCCESS; } /** */ AcResult acGridLoadVectorUniform(const Stream stream, const AcReal3Param param, const AcReal3 value) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); #if AC_DOUBLE_PRECISION == 1 MPI_Datatype datatype = MPI_DOUBLE; #else MPI_Datatype datatype = MPI_FLOAT; #endif const int root_proc = 0; AcReal3 buffer = value; MPI_Bcast(&buffer, 3, datatype, root_proc, MPI_COMM_WORLD); acDeviceLoadVectorUniform(grid.device, stream, param, buffer); return AC_SUCCESS; } AcResult acGridLoadMesh(const Stream stream, const AcMesh host_mesh) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); acDeviceDistributeMeshMPI(host_mesh, grid.decomposition, &grid.submesh); acDeviceLoadMesh(grid.device, stream, grid.submesh); return AC_SUCCESS; } AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); acDeviceStoreMesh(grid.device, stream, &grid.submesh); acGridSynchronizeStream(stream); acDeviceGatherMeshMPI(grid.submesh, grid.decomposition, host_mesh); return AC_SUCCESS; } /* // Unused AcResult acGridIntegratePipelined(const Stream stream, const AcReal dt) { ERRCHK(grid.initialized); acGridLoadScalarUniform(stream, AC_dt, dt); acGridSynchronizeStream(stream); const Device device = grid.device; const int3 nn = grid.nn; #if MPI_INCL_CORNERS CommData corner_data = grid.corner_data; // Do not rm: required for corners #endif // MPI_INCL_CORNERS CommData edgex_data = grid.edgex_data; CommData edgey_data = grid.edgey_data; CommData edgez_data = grid.edgez_data; CommData sidexy_data = grid.sidexy_data; CommData sidexz_data = grid.sidexz_data; CommData sideyz_data = grid.sideyz_data; // Corners #if MPI_INCL_CORNERS // Do not rm: required for corners const int3 corner_b0s[] = { (int3){0, 0, 0}, (int3){NGHOST + nn.x, 0, 0}, (int3){0, NGHOST + nn.y, 0}, 
(int3){0, 0, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, (int3){0, NGHOST + nn.y, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, }; #endif // MPI_INCL_CORNERS // Edges X const int3 edgex_b0s[] = { (int3){NGHOST, 0, 0}, (int3){NGHOST, NGHOST + nn.y, 0}, (int3){NGHOST, 0, NGHOST + nn.z}, (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, }; // Edges Y const int3 edgey_b0s[] = { (int3){0, NGHOST, 0}, (int3){NGHOST + nn.x, NGHOST, 0}, (int3){0, NGHOST, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, }; // Edges Z const int3 edgez_b0s[] = { (int3){0, 0, NGHOST}, (int3){NGHOST + nn.x, 0, NGHOST}, (int3){0, NGHOST + nn.y, NGHOST}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, }; // Sides XY const int3 sidexy_b0s[] = { (int3){NGHOST, NGHOST, 0}, // (int3){NGHOST, NGHOST, NGHOST + nn.z}, // }; // Sides XZ const int3 sidexz_b0s[] = { (int3){NGHOST, 0, NGHOST}, // (int3){NGHOST, NGHOST + nn.y, NGHOST}, // }; // Sides YZ const int3 sideyz_b0s[] = { (int3){0, NGHOST, NGHOST}, // (int3){NGHOST + nn.x, NGHOST, NGHOST}, // }; for (int isubstep = 0; isubstep < 3; ++isubstep) { acDeviceSynchronizeStream(device, STREAM_ALL); MPI_Barrier(MPI_COMM_WORLD); #if MPI_COMPUTE_ENABLED acPackCommData(device, sidexy_b0s, &sidexy_data); acPackCommData(device, sidexz_b0s, &sidexz_data); acPackCommData(device, sideyz_b0s, &sideyz_data); #endif // MPI_COMPUTE_ENABLED #if MPI_COMM_ENABLED acTransferCommData(device, sidexy_b0s, &sidexy_data); acTransferCommData(device, sidexz_b0s, &sidexz_data); acTransferCommData(device, sideyz_b0s, &sideyz_data); #endif // MPI_COMM_ENABLED #if MPI_COMPUTE_ENABLED //////////// INNER INTEGRATION ////////////// { const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = nn; acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba); } acPackCommData(device, edgex_b0s, &edgex_data); acPackCommData(device, edgey_b0s, &edgey_data); acPackCommData(device, edgez_b0s, &edgez_data); #endif // MPI_COMPUTE_ENABLED #if MPI_COMM_ENABLED acTransferCommDataWait(sidexy_data); acUnpinCommData(device, &sidexy_data); acTransferCommDataWait(sidexz_data); acUnpinCommData(device, &sidexz_data); acTransferCommDataWait(sideyz_data); acUnpinCommData(device, &sideyz_data); acTransferCommData(device, edgex_b0s, &edgex_data); acTransferCommData(device, edgey_b0s, &edgey_data); acTransferCommData(device, edgez_b0s, &edgez_data); #endif // MPI_COMM_ENABLED #if MPI_COMPUTE_ENABLED #if MPI_INCL_CORNERS acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acUnpackCommData(device, sidexy_b0s, &sidexy_data); acUnpackCommData(device, sidexz_b0s, &sidexz_data); acUnpackCommData(device, sideyz_b0s, &sideyz_data); #endif // MPI_COMPUTE_ENABLED #if MPI_COMM_ENABLED acTransferCommDataWait(edgex_data); acUnpinCommData(device, &edgex_data); acTransferCommDataWait(edgey_data); acUnpinCommData(device, &edgey_data); acTransferCommDataWait(edgez_data); acUnpinCommData(device, &edgez_data); #if MPI_INCL_CORNERS acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS #endif // MPI_COMM_ENABLED #if MPI_COMPUTE_ENABLED acUnpackCommData(device, edgex_b0s, &edgex_data); acUnpackCommData(device, edgey_b0s, &edgey_data); acUnpackCommData(device, edgez_b0s, &edgez_data); #endif // MPI_COMPUTE_ENABLED #if MPI_COMM_ENABLED #if MPI_INCL_CORNERS acTransferCommDataWait(corner_data); // Do not rm: required 
for corners acUnpinCommData(device, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS #endif // MPI_COMM_ENABLED #if MPI_COMPUTE_ENABLED #if MPI_INCL_CORNERS acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS #endif // MPI_COMPUTE_ENABLED // Wait for unpacking acSyncCommData(sidexy_data); acSyncCommData(sidexz_data); acSyncCommData(sideyz_data); acSyncCommData(edgex_data); acSyncCommData(edgey_data); acSyncCommData(edgez_data); #if MPI_INCL_CORNERS acSyncCommData(corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS #if MPI_COMPUTE_ENABLED { // Front const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba); } { // Back const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba); } { // Bottom const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba); } { // Top const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba); } { // Left const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba); } { // Right const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba); } #endif // MPI_COMPUTE_ENABLED acDeviceSwapBuffers(device); } // Does not have to be STREAM_ALL, only the streams used with // acDeviceIntegrateSubstep (less likely to break this way though) acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done return AC_SUCCESS; } */ AcResult acGridIntegrate(const Stream stream, const AcReal dt) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); const Device device = grid.device; const int3 nn = grid.nn; #if MPI_INCL_CORNERS CommData corner_data = grid.corner_data; // Do not rm: required for corners #endif // MPI_INCL_CORNERS CommData edgex_data = grid.edgex_data; CommData edgey_data = grid.edgey_data; CommData edgez_data = grid.edgez_data; CommData sidexy_data = grid.sidexy_data; CommData sidexz_data = grid.sidexz_data; CommData sideyz_data = grid.sideyz_data; acGridLoadScalarUniform(stream, AC_dt, dt); acDeviceSynchronizeStream(device, stream); // Check the position in MPI frame int nprocs, pid; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &pid); const uint3_64 decomposition = decompose(nprocs); const int3 pid3d = getPid3D(pid, decomposition); // Corners #if MPI_INCL_CORNERS // Do not rm: required for corners const int3 corner_b0s[] = { (int3){0, 0, 0}, (int3){NGHOST + nn.x, 0, 0}, (int3){0, NGHOST + nn.y, 0}, (int3){0, 0, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, (int3){0, NGHOST + nn.y, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, }; #endif // MPI_INCL_CORNERS // Edges X const int3 edgex_b0s[] = { (int3){NGHOST, 0, 0}, (int3){NGHOST, 
NGHOST + nn.y, 0}, (int3){NGHOST, 0, NGHOST + nn.z}, (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, }; // Edges Y const int3 edgey_b0s[] = { (int3){0, NGHOST, 0}, (int3){NGHOST + nn.x, NGHOST, 0}, (int3){0, NGHOST, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, }; // Edges Z const int3 edgez_b0s[] = { (int3){0, 0, NGHOST}, (int3){NGHOST + nn.x, 0, NGHOST}, (int3){0, NGHOST + nn.y, NGHOST}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, }; // Sides XY const int3 sidexy_b0s[] = { (int3){NGHOST, NGHOST, 0}, // (int3){NGHOST, NGHOST, NGHOST + nn.z}, // }; // Sides XZ const int3 sidexz_b0s[] = { (int3){NGHOST, 0, NGHOST}, // (int3){NGHOST, NGHOST + nn.y, NGHOST}, // }; // Sides YZ const int3 sideyz_b0s[] = { (int3){0, NGHOST, NGHOST}, // (int3){NGHOST + nn.x, NGHOST, NGHOST}, // }; for (int isubstep = 0; isubstep < 3; ++isubstep) { #if MPI_COMM_ENABLED #if MPI_INCL_CORNERS acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acPackCommData(device, edgex_b0s, &edgex_data); acPackCommData(device, edgey_b0s, &edgey_data); acPackCommData(device, edgez_b0s, &edgez_data); acPackCommData(device, sidexy_b0s, &sidexy_data); acPackCommData(device, sidexz_b0s, &sidexz_data); acPackCommData(device, sideyz_b0s, &sideyz_data); #endif #if MPI_COMM_ENABLED MPI_Barrier(MPI_COMM_WORLD); #if MPI_GPUDIRECT_DISABLED #if MPI_INCL_CORNERS acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acTransferCommDataToHost(device, &edgex_data); acTransferCommDataToHost(device, &edgey_data); acTransferCommDataToHost(device, &edgez_data); acTransferCommDataToHost(device, &sidexy_data); acTransferCommDataToHost(device, &sidexz_data); acTransferCommDataToHost(device, &sideyz_data); #endif #if MPI_INCL_CORNERS acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acTransferCommData(device, edgex_b0s, &edgex_data); acTransferCommData(device, edgey_b0s, &edgey_data); acTransferCommData(device, edgez_b0s, &edgez_data); acTransferCommData(device, sidexy_b0s, &sidexy_data); acTransferCommData(device, sidexz_b0s, &sidexz_data); acTransferCommData(device, sideyz_b0s, &sideyz_data); #endif // MPI_COMM_ENABLED #if MPI_COMPUTE_ENABLED //////////// INNER INTEGRATION ////////////// { const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = nn; acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba); } //////////////////////////////////////////// #endif // MPI_COMPUTE_ENABLED #if MPI_COMM_ENABLED #if MPI_INCL_CORNERS acTransferCommDataWait(corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acTransferCommDataWait(edgex_data); acTransferCommDataWait(edgey_data); acTransferCommDataWait(edgez_data); acTransferCommDataWait(sidexy_data); acTransferCommDataWait(sidexz_data); acTransferCommDataWait(sideyz_data); #if MPI_INCL_CORNERS acUnpinCommData(device, &corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acUnpinCommData(device, &edgex_data); acUnpinCommData(device, &edgey_data); acUnpinCommData(device, &edgez_data); acUnpinCommData(device, &sidexy_data); acUnpinCommData(device, &sidexz_data); acUnpinCommData(device, &sideyz_data); #if MPI_INCL_CORNERS acUnpackCommData(device, corner_b0s, &corner_data); #endif // MPI_INCL_CORNERS acUnpackCommData(device, edgex_b0s, &edgex_data); acUnpackCommData(device, edgey_b0s, &edgey_data); acUnpackCommData(device, edgez_b0s, 
&edgez_data); acUnpackCommData(device, sidexy_b0s, &sidexy_data); acUnpackCommData(device, sidexz_b0s, &sidexz_data); acUnpackCommData(device, sideyz_b0s, &sideyz_data); //////////// OUTER INTEGRATION ////////////// // Wait for unpacking #if MPI_INCL_CORNERS acSyncCommData(corner_data); // Do not rm: required for corners #endif // MPI_INCL_CORNERS acSyncCommData(edgex_data); acSyncCommData(edgey_data); acSyncCommData(edgez_data); acSyncCommData(sidexy_data); acSyncCommData(sidexz_data); acSyncCommData(sideyz_data); #endif // MPI_COMM_ENABLED // Set outer boudaries after substep computation. const int3 m1 = (int3){0, 0, 0}; const int3 m2 = nn; const int3 pid3d = getPid3D(pid, decomposition); // If we are are a boundary element int3 bindex = (int3){0, 0, 0}; // 1 is top edge, 2 bottom edge, 3 both edges, 0 is no boundary if (pid3d.x == 0) { bindex.x = 1; } else if (pid3d.x == decomposition.x - 1) { bindex.x = 2; } else if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 3; } if (pid3d.y == 0) { bindex.y = 1; } else if (pid3d.y == decomposition.y - 1) { bindex.y = 2; } else if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 3; } if (pid3d.z == 0) { bindex.z = 1; } else if (pid3d.z == decomposition.z - 1) { bindex.z = 2; } else if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 3; } { //TODO get bound_direction -> bindex acDeviceGeneralBoundconds(device, stream, m1, m2, bindex); } acGridSynchronizeStream(stream); #if MPI_COMPUTE_ENABLED { // Front const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba); } { // Back const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba); } { // Bottom const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba); } { // Top const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba); } { // Left const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba); } { // Right const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba); } #endif // MPI_COMPUTE_ENABLED acDeviceSwapBuffers(device); acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done //////////////////////////////////////////// } return AC_SUCCESS; } AcResult acGridPeriodicBoundconds(const Stream stream) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); const Device device = grid.device; const int3 nn = grid.nn; CommData corner_data = grid.corner_data; CommData edgex_data = grid.edgex_data; CommData edgey_data = grid.edgey_data; CommData edgez_data = grid.edgez_data; CommData sidexy_data = grid.sidexy_data; CommData sidexz_data = grid.sidexz_data; CommData sideyz_data = grid.sideyz_data; // Corners const int3 corner_b0s[] = { (int3){0, 0, 0}, (int3){NGHOST + nn.x, 0, 0}, (int3){0, NGHOST + nn.y, 0}, 
(int3){0, 0, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, (int3){0, NGHOST + nn.y, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, }; // Edges X const int3 edgex_b0s[] = { (int3){NGHOST, 0, 0}, (int3){NGHOST, NGHOST + nn.y, 0}, (int3){NGHOST, 0, NGHOST + nn.z}, (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, }; // Edges Y const int3 edgey_b0s[] = { (int3){0, NGHOST, 0}, (int3){NGHOST + nn.x, NGHOST, 0}, (int3){0, NGHOST, NGHOST + nn.z}, (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, }; // Edges Z const int3 edgez_b0s[] = { (int3){0, 0, NGHOST}, (int3){NGHOST + nn.x, 0, NGHOST}, (int3){0, NGHOST + nn.y, NGHOST}, (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, }; // Sides XY const int3 sidexy_b0s[] = { (int3){NGHOST, NGHOST, 0}, // (int3){NGHOST, NGHOST, NGHOST + nn.z}, // }; // Sides XZ const int3 sidexz_b0s[] = { (int3){NGHOST, 0, NGHOST}, // (int3){NGHOST, NGHOST + nn.y, NGHOST}, // }; // Sides YZ const int3 sideyz_b0s[] = { (int3){0, NGHOST, NGHOST}, // (int3){NGHOST + nn.x, NGHOST, NGHOST}, // }; acPackCommData(device, corner_b0s, &corner_data); acPackCommData(device, edgex_b0s, &edgex_data); acPackCommData(device, edgey_b0s, &edgey_data); acPackCommData(device, edgez_b0s, &edgez_data); acPackCommData(device, sidexy_b0s, &sidexy_data); acPackCommData(device, sidexz_b0s, &sidexz_data); acPackCommData(device, sideyz_b0s, &sideyz_data); MPI_Barrier(MPI_COMM_WORLD); #if MPI_GPUDIRECT_DISABLED acTransferCommDataToHost(device, &corner_data); acTransferCommDataToHost(device, &edgex_data); acTransferCommDataToHost(device, &edgey_data); acTransferCommDataToHost(device, &edgez_data); acTransferCommDataToHost(device, &sidexy_data); acTransferCommDataToHost(device, &sidexz_data); acTransferCommDataToHost(device, &sideyz_data); #endif acTransferCommData(device, corner_b0s, &corner_data); acTransferCommData(device, edgex_b0s, &edgex_data); acTransferCommData(device, edgey_b0s, &edgey_data); acTransferCommData(device, edgez_b0s, &edgez_data); acTransferCommData(device, sidexy_b0s, &sidexy_data); acTransferCommData(device, sidexz_b0s, &sidexz_data); acTransferCommData(device, sideyz_b0s, &sideyz_data); acTransferCommDataWait(corner_data); acTransferCommDataWait(edgex_data); acTransferCommDataWait(edgey_data); acTransferCommDataWait(edgez_data); acTransferCommDataWait(sidexy_data); acTransferCommDataWait(sidexz_data); acTransferCommDataWait(sideyz_data); #if MPI_GPUDIRECT_DISABLED acTransferCommDataToDevice(device, &corner_data); acTransferCommDataToDevice(device, &edgex_data); acTransferCommDataToDevice(device, &edgey_data); acTransferCommDataToDevice(device, &edgez_data); acTransferCommDataToDevice(device, &sidexy_data); acTransferCommDataToDevice(device, &sidexz_data); acTransferCommDataToDevice(device, &sideyz_data); #endif acUnpinCommData(device, &corner_data); acUnpinCommData(device, &edgex_data); acUnpinCommData(device, &edgey_data); acUnpinCommData(device, &edgez_data); acUnpinCommData(device, &sidexy_data); acUnpinCommData(device, &sidexz_data); acUnpinCommData(device, &sideyz_data); acUnpackCommData(device, corner_b0s, &corner_data); acUnpackCommData(device, edgex_b0s, &edgex_data); acUnpackCommData(device, edgey_b0s, &edgey_data); acUnpackCommData(device, edgez_b0s, &edgez_data); acUnpackCommData(device, sidexy_b0s, &sidexy_data); acUnpackCommData(device, sidexz_b0s, &sidexz_data); acUnpackCommData(device, sideyz_b0s, &sideyz_data); // Wait for unpacking acSyncCommData(corner_data); acSyncCommData(edgex_data); 
acSyncCommData(edgey_data); acSyncCommData(edgez_data); acSyncCommData(sidexy_data); acSyncCommData(sidexz_data); acSyncCommData(sideyz_data); return AC_SUCCESS; } static AcResult acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* result) { MPI_Op op; if (rtype == RTYPE_MAX) { op = MPI_MAX; } else if (rtype == RTYPE_MIN) { op = MPI_MIN; } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { op = MPI_SUM; } else { ERROR("Unrecognised rtype"); } #if AC_DOUBLE_PRECISION == 1 MPI_Datatype datatype = MPI_DOUBLE; #else MPI_Datatype datatype = MPI_FLOAT; #endif int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); int world_size; MPI_Comm_size(MPI_COMM_WORLD, &world_size); AcReal mpi_res; MPI_Reduce(&local_result, &mpi_res, 1, datatype, op, 0, MPI_COMM_WORLD); if (rank == 0) { if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { const AcReal inv_n = AcReal(1.) / (grid.nn.x * grid.decomposition.x * grid.nn.y * grid.decomposition.y * grid.nn.z * grid.decomposition.z); mpi_res = sqrt(inv_n * mpi_res); } *result = mpi_res; } return AC_SUCCESS; } AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) { ERRCHK(grid.initialized); const Device device = grid.device; acGridSynchronizeStream(STREAM_ALL); // MPI_Barrier(MPI_COMM_WORLD); AcReal local_result; acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result); return acMPIReduceScal(local_result, rtype, result); } AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result) { ERRCHK(grid.initialized); const Device device = grid.device; acGridSynchronizeStream(STREAM_ALL); // MPI_Barrier(MPI_COMM_WORLD); AcReal local_result; acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result); return acMPIReduceScal(local_result, rtype, result); } //MV: for MPI we will need acGridReduceVecScal() to get Alfven speeds etc. TODO #endif // AC_MPI_ENABLED
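
// A minimal usage sketch of the grid API above, assuming MPI has already been
// initialized by the host application and that `info`, `mesh`, and `dt` are set up
// elsewhere (illustrative only, not part of the library):
//
//     acGridInit(info);                    // decompose the mesh, allocate one submesh per rank
//     acGridLoadMesh(STREAM_0, mesh);      // distribute the host mesh from rank 0 to all ranks
//     acGridIntegrate(STREAM_0, dt);       // halo exchange + three integration substeps
//     acGridPeriodicBoundconds(STREAM_0);  // refresh the halos
//     acGridStoreMesh(STREAM_0, &mesh);    // gather the submeshes back to rank 0
//     acGridQuit();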