Files
astaroth/src/core/device.cu

638 lines
23 KiB
Plaintext

/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "astaroth_device.h"
#include "errchk.h"
// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;
__constant__ AcMeshInfo d_mesh_info;
static inline int __device__
DCONST(const AcIntParam param)
{
return d_mesh_info.int_params[param];
}
static inline int3 __device__
DCONST(const AcInt3Param param)
{
return d_mesh_info.int3_params[param];
}
static inline AcReal __device__
DCONST(const AcRealParam param)
{
return d_mesh_info.real_params[param];
}
static inline AcReal3 __device__
DCONST(const AcReal3Param param)
{
return d_mesh_info.real3_params[param];
}
#define DCONST_INT(x) DCONST(x)
#define DCONST_INT3(x) DCONST(x)
#define DCONST_REAL(x) DCONST(x)
#define DCONST_REAL3(x) DCONST(x)
//#define DCONST_INT(X) (d_mesh_info.int_params[X])
//#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
//#define DCONST_REAL(X) (d_mesh_info.real_params[X])
//#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"
static dim3 rk3_tpb(32, 1, 4);
#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
#endif
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
#if PACKED_DATA_TRANSFERS
// Declare memory for buffers needed for packed data transfers here
// AcReal* data_packing_buffer;
#endif
};
// clang-format off
static __global__ void dummy_kernel(void) {}
// clang-format on
AcResult
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
acDevicePrintInfo(device);
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
#if PACKED_DATA_TRANSFERS
// Allocate data required for packed transfers here (cudaMalloc)
#endif
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Autoptimize
if (id == 0) {
acDeviceAutoOptimize(device);
}
return AC_SUCCESS;
}
AcResult
acDeviceDestroy(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
#if PACKED_DATA_TRANSFERS
// Free data required for packed tranfers here (cudaFree)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
acDevicePrintInfo(const Device device)
{
const int device_id = device->id;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_id);
printf("--------------------------------------------------\n");
printf("Device Number: %d\n", device_id);
const size_t bus_id_max_len = 128;
char bus_id[bus_id_max_len];
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
printf(" PCI bus ID: %s\n", bus_id);
printf(" Device name: %s\n", props.name);
printf(" Compute capability: %d.%d\n", props.major, props.minor);
// Compute
printf(" Compute\n");
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
printf(
" Compute mode: %d\n",
(int)props
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
// Memory
printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage
size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
// MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
// MV: while compiling in TIARA gp cluster. Therefore commeted out.
//!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
//!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
printf("--------------------------------------------------\n");
return AC_SUCCESS;
}
AcResult
acDeviceAutoOptimize(const Device device)
{
cudaSetDevice(device->id);
// RK3
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + (int3){device->local_config.int_params[AC_nx], //
device->local_config.int_params[AC_ny], //
device->local_config.int_params[AC_nz]};
dim3 best_dims(0, 0, 0);
float best_time = INFINITY;
const int num_iterations = 10;
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
break;
if (x * y * z > MAX_THREADS_PER_BLOCK)
break;
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
break;
if (((x * y * z) % WARP_SIZE) != 0)
continue;
const dim3 tpb(x, y, z);
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) // resets the error if any
continue;
// printf("(%d, %d, %d)\n", x, y, z);
cudaEvent_t tstart, tstop;
cudaEventCreate(&tstart);
cudaEventCreate(&tstop);
cudaEventRecord(tstart); // ---------------------------------------- Timing start
for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, tstart, tstop);
ERRCHK_CUDA_KERNEL_ALWAYS();
if (milliseconds < best_time) {
best_time = milliseconds;
best_dims = tpb;
}
}
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
ERRCHK(fp);
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
fclose(fp);
*/
rk3_tpb = best_dims;
return AC_SUCCESS;
}
AcResult
acDeviceSynchronizeStream(const Device device, const Stream stream)
{
cudaSetDevice(device->id);
if (stream == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream]);
}
return AC_SUCCESS;
}
AcResult
acDeviceSwapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
dst, num_vertices);
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst,
num_vertices);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
host_mesh);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes,
src_device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(src_device->local_config);
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
return AC_SUCCESS;
}
AcResult
acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt)
{
cudaSetDevice(device->id);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
else
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end)
{
cudaSetDevice(device->id);
const cudaStream_t stream = device->streams[stream_type];
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
}
return AC_SUCCESS;
}
AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
#if PACKED_DATA_TRANSFERS
// Functions for calling packed data transfers
#endif