/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Single-GPU device layer of Astaroth.
 *
 * Owns the per-GPU resources (streams, vertex buffers, reduction scratchpads)
 * and wraps kernel launches, reductions, and host/device/peer data transfers.
 */
#include "device.cuh"

#include "errchk.h"

// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)

typedef struct {
    AcReal* in[NUM_VTXBUF_HANDLES];
    AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;

__constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
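// Illustration (not compiled): the indexing macros above linearize a 3D vertex
// index in row-major order, x fastest. With a padded mesh of mx = my = 8 (so
// AC_mxy = mx * my = 64), vertex (i, j, k) = (1, 2, 3) maps to
//     DEVICE_VTXBUF_IDX(1, 2, 3) = 1 + 2 * 8 + 3 * 64 = 209.
// The sketch below assumes AC_my and AC_mz exist as int params analogously to
// AC_mx; it is an example of macro usage, not part of the build.
#if 0
static __global__ void
example_index_kernel(const AcReal* in, AcReal* out)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int k = threadIdx.z + blockIdx.z * blockDim.z;
    if (i >= DCONST_INT(AC_mx) || j >= DCONST_INT(AC_my) || k >= DCONST_INT(AC_mz))
        return; // out of bounds: the grid may overshoot the mesh

    // One coalesced read and write per vertex
    out[DEVICE_VTXBUF_IDX(i, j, k)] = in[DEVICE_VTXBUF_IDX(i, j, k)];
}
#endif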
#include "kernels/kernels.cuh"

static dim3 rk3_tpb = (dim3){32, 1, 4};

#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
#endif

struct device_s {
    int id;
    AcMeshInfo local_config;

    // Concurrency
    cudaStream_t streams[NUM_STREAM_TYPES];

    // Memory
    VertexBufferArray vba;
    AcReal* reduce_scratchpad;
    AcReal* reduce_result;

#if PACKED_DATA_TRANSFERS
    // Declare memory for buffers needed for packed data transfers here
    // AcReal* data_packing_buffer;
#endif
};

AcResult
printDeviceInfo(const Device device)
{
    const int device_id = device->id;

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device_id);
    printf("--------------------------------------------------\n");
    printf("Device Number: %d\n", device_id);
    const size_t bus_id_max_len = 128;
    char bus_id[bus_id_max_len];
    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
    printf("  PCI bus ID: %s\n", bus_id);
    printf("    Device name: %s\n", props.name);
    printf("    Compute capability: %d.%d\n", props.major, props.minor);

    // Compute
    printf("  Compute\n");
    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // kHz -> GHz
    printf("    Stream processors: %d\n", props.multiProcessorCount);
    printf("    SP to DP flops performance ratio: %d:1\n",
           props.singleToDoublePrecisionPerfRatio);
    printf("    Compute mode: %d\n",
           (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0

    // Memory
    printf("  Global memory\n");
    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
               (8. * 1024. * 1024. * 1024.)); // DDR: two transfers per clock
    printf("    ECC enabled: %d\n", props.ECCEnabled);

    // Memory usage
    size_t free_bytes, total_bytes;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    const size_t used_bytes = total_bytes - free_bytes;
    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
    printf("  Caches\n");
    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
    // MV: props.totalConstMem and props.sharedMemPerBlock cause an assembler error
    // MV: when compiling on the TIARA gp cluster. Therefore commented out.
    //!! printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
    //!! printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
    printf("  Other\n");
    printf("    Warp size: %d\n", props.warpSize);
    // printf("    Single to double perf. ratio: %dx\n",
    //        props.singleToDoublePrecisionPerfRatio); // Not supported with older CUDA versions
    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
    printf("--------------------------------------------------\n");

    return AC_SUCCESS;
}

static __global__ void
dummy_kernel(void)
{
}

AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
    cudaSetDevice(id);
    cudaDeviceReset();

    // Create Device
    struct device_s* device = (struct device_s*)malloc(sizeof(*device));
    ERRCHK_ALWAYS(device);

    device->id           = id;
    device->local_config = device_config;

    // Check that the code was compiled for the proper GPU architecture
    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    printf("Success!\n");

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
    }

    // Memory
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
    ERRCHK_CUDA_ALWAYS(
        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));

#if PACKED_DATA_TRANSFERS
    // Allocate data required for packed transfers here (cudaMalloc)
#endif

    // Device constants
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
                                          cudaMemcpyHostToDevice));

    // Multi-GPU offset. This is used to compute globalVertexIdx.
    // Might be better to calculate this in astaroth.cu instead of here, s.t.
    // everything related to the decomposition is limited to the multi-GPU layer
    const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]};
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
                                          sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));

    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;

    // Auto-optimize
    if (id == 0)
        autoOptimize(device);

    return AC_SUCCESS;
}
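// Usage illustration (not compiled): a minimal single-GPU lifecycle sketch.
// make_mesh_info() is a hypothetical helper that fills an AcMeshInfo with
// valid dimensions; error handling is elided for brevity.
#if 0
static void
example_device_lifecycle(void)
{
    const AcMeshInfo info = make_mesh_info(); // hypothetical helper

    Device device = NULL;
    createDevice(0, info, &device); // allocates buffers, uploads d_mesh_info
    printDeviceInfo(device);

    // ... integrate, reduce, transfer ...

    destroyDevice(device); // frees all device memory and streams
}
#endif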
AcResult
destroyDevice(Device device)
{
    cudaSetDevice(device->id);
    printf("Destroying device %d (%p)\n", device->id, device);

    // Memory
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

#if PACKED_DATA_TRANSFERS
    // Free data required for packed transfers here (cudaFree)
#endif

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamDestroy(device->streams[i]);
    }

    // Destroy Device
    free(device);
    return AC_SUCCESS;
}

AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start,
              const int3& end)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
    }
    return AC_SUCCESS;
}

AcResult
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
           const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};
    const int3 end   = (int3){device->local_config.int_params[AC_nx_max],
                              device->local_config.int_params[AC_ny_max],
                              device->local_config.int_params[AC_nz_max]};

    *result = reduce_scal(device->streams[stream_type], rtype, start, end,
                          device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
                          device->reduce_result);
    return AC_SUCCESS;
}

AcResult
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
          const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
          const VertexBufferHandle vtxbuf2, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};
    const int3 end   = (int3){device->local_config.int_params[AC_nx_max],
                              device->local_config.int_params[AC_ny_max],
                              device->local_config.int_params[AC_nz_max]};

    *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                         device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
}

AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
       const int3& end, const AcReal dt)
{
    cudaSetDevice(device->id);

    // const dim3 tpb(32, 1, 4);
    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;
    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
    else
        solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);

    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
}
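// Usage illustration (not compiled): one full RK3 integration step. The grid
// dimensions in rkStep use ceiling division, e.g. n.x = 128 with tpb.x = 32
// gives bpg.x = 4, so the domain is always fully covered. The call order below
// (boundary conditions, substep, buffer swap, sync) is a plausible host-side
// loop, not a verbatim copy of the multi-GPU layer; it assumes STREAM_PRIMARY
// is a StreamType and AC_mx/AC_my/AC_mz are defined int params.
#if 0
static void
example_rk3_step(const Device device, const int3 start, const int3 end, const AcReal dt)
{
    for (int substep = 0; substep < 3; ++substep) {
        // Refresh the ghost zones of the full mesh, then advance the substep
        boundcondStep(device, STREAM_PRIMARY, (int3){0, 0, 0},
                      (int3){device->local_config.int_params[AC_mx],
                             device->local_config.int_params[AC_my],
                             device->local_config.int_params[AC_mz]});
        rkStep(device, STREAM_PRIMARY, substep, start, end, dt);
        swapBuffers(device); // output of this substep becomes input of the next
        synchronize(device, STREAM_ALL);
    }
}
#endif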
AcResult
synchronize(const Device device, const StreamType stream_type)
{
    cudaSetDevice(device->id);
    if (stream_type == STREAM_ALL) {
        cudaDeviceSynchronize();
    }
    else {
        cudaStreamSynchronize(device->streams[stream_type]);
    }
    return AC_SUCCESS;
}

static AcResult
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
               const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA(
        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
    return AC_SUCCESS;
}

static AcResult
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
                const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA(
        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
    return AC_SUCCESS;
}

AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
                 const int3& src, const int3& dst, const int num_vertices)
{
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
                       num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
    }
    return AC_SUCCESS;
}

AcResult
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
               const int num_vertices, AcMesh* host_mesh)
{
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
                        num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
    }
    return AC_SUCCESS;
}

AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
                       Device dst_device, const int3& dst, const int num_vertices)
{
    cudaSetDevice(src_device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
                                        &src_device->vba.in[i][src_idx], src_device->id,
                                        sizeof(src_device->vba.in[i][0]) * num_vertices,
                                        src_device->streams[stream_type]));
    }
    return AC_SUCCESS;
}

AcResult
swapBuffers(const Device device)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        AcReal* tmp        = device->vba.in[i];
        device->vba.in[i]  = device->vba.out[i];
        device->vba.out[i] = tmp;
    }
    return AC_SUCCESS;
}
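// Usage illustration (not compiled): a host <-> device round trip over the
// full mesh. Assumes host_mesh was allocated with the same AcMeshInfo that was
// passed to createDevice, and that a helper acVertexBufferSize() returning the
// vertex count exists alongside acVertexBufferSizeBytes() (an assumption).
#if 0
static void
example_mesh_round_trip(const Device device, AcMesh* host_mesh)
{
    const int3 origin      = (int3){0, 0, 0};
    const int num_vertices = acVertexBufferSize(host_mesh->info); // assumed helper

    copyMeshToDevice(device, STREAM_PRIMARY, *host_mesh, origin, origin, num_vertices);
    // ... compute on the device ...
    copyMeshToHost(device, STREAM_PRIMARY, origin, origin, num_vertices, host_mesh);
    synchronize(device, STREAM_PRIMARY); // the copies above are asynchronous
}
#endif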
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param,
                   const int value)
{
    cudaSetDevice(device->id);
    // CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here)
    // and something like d_mesh_info.real_params[] cannot be accessed directly.
    // Therefore we have to obfuscate the code a bit and compute the offset address before
    // invoking cudaMemcpyToSymbol.
    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                               cudaMemcpyHostToDevice,
                                               device->streams[stream_type]));
    return AC_SUCCESS;
}

AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param,
                   const AcReal value)
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                               cudaMemcpyHostToDevice,
                                               device->streams[stream_type]));
    return AC_SUCCESS;
}

AcResult
loadGlobalGrid(const Device device, const Grid grid)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA_ALWAYS(
        cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
    return AC_SUCCESS;
}

AcResult
autoOptimize(const Device device)
{
    cudaSetDevice(device->id);

    // RK3
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = start + (int3){device->local_config.int_params[AC_nx], //
                                      device->local_config.int_params[AC_ny], //
                                      device->local_config.int_params[AC_nz]};

    dim3 best_dims(0, 0, 0);
    float best_time          = INFINITY;
    const int num_iterations = 10;

    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {

                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
                    break;
                if (x * y * z > MAX_THREADS_PER_BLOCK)
                    break;
                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
                    break;
                if (((x * y * z) % WARP_SIZE) != 0)
                    continue;

                const dim3 tpb(x, y, z);
                const int3 n = end - start;
                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                               (unsigned int)ceil(n.z / AcReal(tpb.z)));

                cudaDeviceSynchronize();
                if (cudaGetLastError() != cudaSuccess) // resets the error if any
                    continue;

                // printf("(%d, %d, %d)\n", x, y, z);
                cudaEvent_t tstart, tstop;
                cudaEventCreate(&tstart);
                cudaEventCreate(&tstop);

                cudaEventRecord(tstart); // --------------------------------- Timing start
                for (int i = 0; i < num_iterations; ++i)
                    solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
                cudaEventRecord(tstop); // ---------------------------------- Timing end
                cudaEventSynchronize(tstop);

                float milliseconds = 0;
                cudaEventElapsedTime(&milliseconds, tstart, tstop);

                ERRCHK_CUDA_KERNEL_ALWAYS();
                if (milliseconds < best_time) {
                    best_time = milliseconds;
                    best_dims = tpb;
                }
            }
        }
    }
#if VERBOSE_PRINTING
    printf(
        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
    ERRCHK(fp);
    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
    fclose(fp);
    */

    rk3_tpb = best_dims;

    return AC_SUCCESS;
}

#if PACKED_DATA_TRANSFERS
// Functions for calling packed data transfers
#endif
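// Illustration (not compiled): the byte-offset trick from loadDeviceConstant
// in miniature. A single member of a struct living in __constant__ memory is
// updated by passing its byte offset from the start of the symbol to
// cudaMemcpyToSymbol; offsetof() would compute the same value. ToyParams and
// d_toy are hypothetical names used only for this sketch.
#if 0
typedef struct {
    int a;
    float b;
} ToyParams;

__constant__ ToyParams d_toy;

static void
example_update_member(const float value)
{
    // Offset of d_toy.b within the symbol, == offsetof(ToyParams, b)
    const size_t offset = (size_t)&d_toy.b - (size_t)&d_toy;
    cudaMemcpyToSymbol(d_toy, &value, sizeof(value), offset, cudaMemcpyHostToDevice);
}
#endif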