Added Astaroth 2.0
This commit is contained in:
309
src/core/device.cu
Normal file
@@ -0,0 +1,309 @@
/*
    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Per-GPU device layer: memory management, host/device transfers and
 *        kernel dispatch.
 *
 * Each Device owns its vertex buffers, reduction scratchpads and CUDA streams.
 * The functions in this file create and destroy a device and launch the
 * boundary-condition, integration and data-transfer operations on it.
 */
#include "device.cuh"
|
||||
|
||||
#include "errchk.h"
|
||||
|
||||
typedef struct {
|
||||
AcReal* in[NUM_VTXBUF_HANDLES];
|
||||
AcReal* out[NUM_VTXBUF_HANDLES];
|
||||
} VertexBufferArray;
|
||||
|
||||
__constant__ AcMeshInfo d_mesh_info;
|
||||
#define DCONST_INT(X) (d_mesh_info.int_params[X])
|
||||
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
|
||||
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
|
||||
#include "kernels/kernels.cuh"
|
||||
|
||||
struct device_s {
    int id;
    AcMeshInfo local_config;

    // Concurrency
    cudaStream_t streams[NUM_STREAM_TYPES];

    // Memory
    VertexBufferArray vba;
    AcReal* reduce_scratchpad;
    AcReal* reduce_result;
};

AcResult
printDeviceInfo(const Device device)
{
    const int device_id = device->id;

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device_id);
    printf("--------------------------------------------------\n");
    printf("Device Number: %d\n", device_id);
    const size_t bus_id_max_len = 128;
    char bus_id[bus_id_max_len];
    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
    printf(" PCI bus ID: %s\n", bus_id);
    printf(" Device name: %s\n", props.name);
    printf(" Compute capability: %d.%d\n", props.major, props.minor);

    // Compute
    printf(" Compute\n");
    printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // kHz -> GHz
    printf(" Stream processors: %d\n", props.multiProcessorCount);
    printf(" SP to DP flops performance ratio: %d:1\n",
           props.singleToDoublePrecisionPerfRatio);
    printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
    // Memory
    printf(" Global memory\n");
    printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
    printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
    printf(" Peak Memory Bandwidth (GiB/s): %f\n",
           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
               (8. * 1024. * 1024. * 1024.));
    printf(" ECC enabled: %d\n", props.ECCEnabled);
    // Memory usage
    size_t free_bytes, total_bytes;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    const size_t used_bytes = total_bytes - free_bytes;
    printf(" Total global mem: %.2f GiB\n",
           props.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
    printf(" Gmem memory free (GiB): %.2f\n",
           free_bytes / (1024.0 * 1024 * 1024));
    printf(" Gmem memory total (GiB): %.2f\n",
           total_bytes / (1024.0 * 1024 * 1024));
    printf(" Caches\n");
    printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
    printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
    printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
    printf(" Total const mem: %zu KiB\n", props.totalConstMem / (1024));
    printf(" Shared mem per block: %zu KiB\n",
           props.sharedMemPerBlock / (1024));
    printf(" Other\n");
    printf(" Warp size: %d\n", props.warpSize);
    // printf(" Single to double perf. ratio: %dx\n",
    //        props.singleToDoublePrecisionPerfRatio); // Not supported with older CUDA versions
    printf(" Stream priorities supported: %d\n",
           props.streamPrioritiesSupported);
    printf("--------------------------------------------------\n");

    return AC_SUCCESS;
}

static __global__ void dummy_kernel(void) {}

AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
    cudaSetDevice(id);
    cudaDeviceReset();

    // Create Device
    struct device_s* device = (struct device_s*) malloc(sizeof(*device));
    ERRCHK_ALWAYS(device);

    device->id = id;
    device->local_config = device_config;

    // Check that the code was compiled for the proper GPU architecture
    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    printf("Success!\n");

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamCreate(&device->streams[i]);
    }

    // Memory
    const size_t vba_size_bytes = AC_VTXBUF_SIZE_BYTES(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
                                  AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));

    // Device constants
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
                                          cudaMemcpyHostToDevice));

    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;
    return AC_SUCCESS;
}

AcResult
destroyDevice(Device device)
{
    cudaSetDevice(device->id);
    printf("Destroying device %d (%p)\n", device->id, device);

    // Memory
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i)
        cudaStreamDestroy(device->streams[i]);

    // Destroy Device
    free(device);
    return AC_SUCCESS;
}

AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
    }
    return AC_SUCCESS;
}

AcResult
reduceScal(const Device device)
{
    cudaSetDevice(device->id);
    // Not yet implemented: the reduction kernels are not wired up in this revision.
    return AC_SUCCESS;
}

AcResult
reduceVec(const Device device)
{
    cudaSetDevice(device->id);
    // Not yet implemented: the reduction kernels are not wired up in this revision.
    return AC_SUCCESS;
}

AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number,
       const int3& start, const int3& end, const AcReal dt)
{
    cudaSetDevice(device->id);
    rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
    return AC_SUCCESS;
}

AcResult
synchronize(const Device device, const StreamType stream_type)
{
    cudaSetDevice(device->id);
    if (stream_type == STREAM_ALL) {
        cudaDeviceSynchronize();
    } else {
        cudaStreamSynchronize(device->streams[stream_type]);
    }
    return AC_SUCCESS;
}

static AcResult
loadWithOffset(const Device device, const StreamType stream_type,
               const AcReal* src, const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
                                device->streams[stream_type]));
    return AC_SUCCESS;
}

static AcResult
storeWithOffset(const Device device, const StreamType stream_type,
                const AcReal* src, const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
                                device->streams[stream_type]));
    return AC_SUCCESS;
}

AcResult
copyMeshToDevice(const Device device, const StreamType stream_type,
                 const AcMesh& host_mesh, const int3& src, const int3& dst,
                 const int num_vertices)
{
    const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
    const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
                       num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
    }
    return AC_SUCCESS;
}

AcResult
copyMeshToHost(const Device device, const StreamType stream_type,
               const int3& src, const int3& dst, const int num_vertices,
               AcMesh* host_mesh)
{
    const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
    const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
                        num_vertices * sizeof(AcReal),
                        &host_mesh->vertex_buffer[i][dst_idx]);
    }
    return AC_SUCCESS;
}

AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
                       const int3& src, Device dst_device, const int3& dst,
                       const int num_vertices)
{
    cudaSetDevice(src_device->id);
    const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
    const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, dst_device->local_config);

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
                                        &src_device->vba.in[i][src_idx], src_device->id,
                                        sizeof(src_device->vba.in[i][0]) * num_vertices,
                                        src_device->streams[stream_type]));
    }
    return AC_SUCCESS;
}

AcResult
swapBuffers(const Device device)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        AcReal* tmp = device->vba.in[i];
        device->vba.in[i] = device->vba.out[i];
        device->vba.out[i] = tmp;
    }
    return AC_SUCCESS;
}
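
The file above exposes the per-GPU building blocks: create/destroy, host/device transfers, boundary conditions, RK substeps and buffer swaps. The sketch below shows how they might be driven for a single integration step on one GPU. It is illustrative only and not part of the commit: STREAM_PRIMARY, NGHOST and the AC_my/AC_mz parameters are assumed to exist elsewhere in Astaroth (only AC_mx and AC_mxy appear in this file), and the host-side AcMesh/AcMeshInfo setup is assumed to be done by the rest of the API.

// Hypothetical usage sketch (not part of this commit). Assumes a host AcMesh
// `mesh` and its AcMeshInfo `config` were allocated and filled elsewhere, and
// that STREAM_PRIMARY, NGHOST, AC_my and AC_mz exist in the surrounding code.
static void
integrate_step_sketch(const AcMeshInfo& config, AcMesh* mesh, const AcReal dt)
{
    Device device;
    createDevice(0, config, &device); // GPU 0; also uploads config to d_mesh_info
    printDeviceInfo(device);

    // Upload the whole host mesh (ghost zones included) into the input buffers.
    const int3 zero        = {0, 0, 0};
    const int num_vertices = (int)(AC_VTXBUF_SIZE_BYTES(config) / sizeof(AcReal));
    copyMeshToDevice(device, STREAM_PRIMARY, *mesh, zero, zero, num_vertices);

    // Placeholder bounds: the full mesh for the boundary conditions, the
    // interior (NGHOST-wide ghost zones stripped) for the integration kernel.
    const int3 m_end = {config.int_params[AC_mx], config.int_params[AC_my],
                        config.int_params[AC_mz]};
    const int3 start = {NGHOST, NGHOST, NGHOST};
    const int3 end   = {m_end.x - NGHOST, m_end.y - NGHOST, m_end.z - NGHOST};

    // One full third-order Runge-Kutta step = three substeps, swapping the
    // double-buffered vertex buffers after each one.
    for (int substep = 0; substep < 3; ++substep) {
        boundcondStep(device, STREAM_PRIMARY, zero, m_end);
        rkStep(device, STREAM_PRIMARY, substep, start, end, dt);
        swapBuffers(device);
    }
    synchronize(device, STREAM_ALL);

    // Download the result (the transfer is asynchronous) and release the device.
    copyMeshToHost(device, STREAM_PRIMARY, zero, zero, num_vertices, mesh);
    synchronize(device, STREAM_ALL);
    destroyDevice(device);
}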