diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 7aa9751..1441967 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -105,6 +105,7 @@ typedef enum { } ReductionType; typedef enum { STREAM_DEFAULT, NUM_STREAM_TYPES } Stream; +#define STREAM_ALL (-1) #define AC_GEN_ID(X) X typedef enum { diff --git a/include/astaroth_device.h b/include/astaroth_device.h index 48c8db2..929b591 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -88,7 +88,7 @@ AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* hos AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, - const int num_vertices, Device* dst_device); + const int num_vertices, Device dst_device); /** */ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, @@ -106,15 +106,21 @@ AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Devi AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start, +AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, const int3 end); + +/** */ +AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end); + /** */ AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, - const VertexBufferHandle vec0, const VertexBufferHandle vec1, - const VertexBufferHandle vec2, AcReal* result); +AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result); #ifdef __cplusplus } // extern "C" diff --git a/src/core/device.cu b/src/core/device.cu index 5d7dd69..dbee2cc 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -16,7 +16,602 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ -#include "astaroth_device.h" + +/** + * @file + * \brief Brief info. + * + * Detailed info. + * + */ +#include "astaroth_device.cuh" + +#include "errchk.h" + +// Device info +#define REGISTERS_PER_THREAD (255) +#define MAX_REGISTERS_PER_BLOCK (65536) +#define MAX_THREADS_PER_BLOCK (1024) +#define WARP_SIZE (32) + +typedef struct { + AcReal* in[NUM_VTXBUF_HANDLES]; + AcReal* out[NUM_VTXBUF_HANDLES]; +} VertexBufferArray; + +__constant__ AcMeshInfo d_mesh_info; +__constant__ int3 d_multigpu_offset; +__constant__ Grid globalGrid; +#define DCONST_INT(X) (d_mesh_info.int_params[X]) +#define DCONST_INT3(X) (d_mesh_info.int3_params[X]) +#define DCONST_REAL(X) (d_mesh_info.real_params[X]) +#define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) +#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) +#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) +#include "kernels/kernels.cuh" + +static dim3 rk3_tpb = (dim3){32, 1, 4}; + +#if PACKED_DATA_TRANSFERS // Defined in device.cuh +// #include "kernels/pack_unpack.cuh" +#endif struct device_s { + int id; + AcMeshInfo local_config; + + // Concurrency + cudaStream_t streams[NUM_STREAM_TYPES]; + + // Memory + VertexBufferArray vba; + AcReal* reduce_scratchpad; + AcReal* reduce_result; + +#if PACKED_DATA_TRANSFERS +// Declare memory for buffers needed for packed data transfers here +// AcReal* data_packing_buffer; +#endif }; + +static __global__ void +dummy_kernel(void) +{ +} + +AcResult +acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) +{ + cudaSetDevice(id); + cudaDeviceReset(); + + // Create Device + struct device_s* device = (struct device_s*)malloc(sizeof(*device)); + ERRCHK_ALWAYS(device); + + device->id = id; + device->local_config = device_config; + + // Check that the code was compiled for the proper GPU architecture + printf("Trying to run a dummy kernel. If this fails, make sure that your\n" + "device supports the CUDA architecture you are compiling for.\n" + "Running dummy kernel... "); + fflush(stdout); + dummy_kernel<<<1, 1>>>(); + ERRCHK_CUDA_KERNEL_ALWAYS(); + printf("Success!\n"); + + // Concurrency + for (int i = 0; i < NUM_STREAM_TYPES; ++i) { + cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); + } + + // Memory + const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); + } + ERRCHK_CUDA_ALWAYS( + cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); + +#if PACKED_DATA_TRANSFERS +// Allocate data required for packed transfers here (cudaMalloc) +#endif + + // Device constants + ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, + cudaMemcpyHostToDevice)); + + // Multi-GPU offset. This is used to compute globalVertexIdx. + // Might be better to calculate this in astaroth.cu instead of here, s.t. + // everything related to the decomposition is limited to the multi-GPU layer + const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; + ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, + sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); + + printf("Created device %d (%p)\n", device->id, device); + *device_handle = device; + + // Autoptimize + if (id == 0) { + autoOptimize(device); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceDestroy(Device device) +{ + cudaSetDevice(device->id); + printf("Destroying device %d (%p)\n", device->id, device); + + // Memory + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + cudaFree(device->vba.in[i]); + cudaFree(device->vba.out[i]); + } + cudaFree(device->reduce_scratchpad); + cudaFree(device->reduce_result); + +#if PACKED_DATA_TRANSFERS +// Free data required for packed tranfers here (cudaFree) +#endif + + // Concurrency + for (int i = 0; i < NUM_STREAM_TYPES; ++i) { + cudaStreamDestroy(device->streams[i]); + } + + // Destroy Device + free(device); + return AC_SUCCESS; +} + +AcResult +acDevicePrintInfo(const Device device) +{ + const int device_id = device->id; + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_id); + printf("--------------------------------------------------\n"); + printf("Device Number: %d\n", device_id); + const size_t bus_id_max_len = 128; + char bus_id[bus_id_max_len]; + cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id); + printf(" PCI bus ID: %s\n", bus_id); + printf(" Device name: %s\n", props.name); + printf(" Compute capability: %d.%d\n", props.major, props.minor); + + // Compute + printf(" Compute\n"); + printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz + printf(" Stream processors: %d\n", props.multiProcessorCount); + printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); + printf( + " Compute mode: %d\n", + (int)props + .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 + // Memory + printf(" Global memory\n"); + printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); + printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); + printf(" Peak Memory Bandwidth (GiB/s): %f\n", + 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.)); + printf(" ECC enabled: %d\n", props.ECCEnabled); + + // Memory usage + size_t free_bytes, total_bytes; + cudaMemGetInfo(&free_bytes, &total_bytes); + const size_t used_bytes = total_bytes - free_bytes; + printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); + printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); + printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); + printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); + printf(" Caches\n"); + printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); + printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); + printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); + // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error + // MV: while compiling in TIARA gp cluster. Therefore commeted out. + //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); + //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); + printf(" Other\n"); + printf(" Warp size: %d\n", props.warpSize); + // printf(" Single to double perf. ratio: %dx\n", + // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA + // versions + printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); + printf("--------------------------------------------------\n"); + + return AC_SUCCESS; +} + +AcResult +autoOptimize(const Device device) +{ + cudaSetDevice(device->id); + + // RK3 + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + (int3){device->local_config.int_params[AC_nx], // + device->local_config.int_params[AC_ny], // + device->local_config.int_params[AC_nz]}; + + dim3 best_dims(0, 0, 0); + float best_time = INFINITY; + const int num_iterations = 10; + + for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { + for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { + for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) { + + if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z) + break; + if (x * y * z > MAX_THREADS_PER_BLOCK) + break; + + if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK) + break; + + if (((x * y * z) % WARP_SIZE) != 0) + continue; + + const dim3 tpb(x, y, z); + const int3 n = end - start; + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // + (unsigned int)ceil(n.y / AcReal(tpb.y)), // + (unsigned int)ceil(n.z / AcReal(tpb.z))); + + cudaDeviceSynchronize(); + if (cudaGetLastError() != cudaSuccess) // resets the error if any + continue; + + // printf("(%d, %d, %d)\n", x, y, z); + + cudaEvent_t tstart, tstop; + cudaEventCreate(&tstart); + cudaEventCreate(&tstop); + + cudaEventRecord(tstart); // ---------------------------------------- Timing start + + for (int i = 0; i < num_iterations; ++i) + solve<2><<>>(start, end, device->vba, FLT_EPSILON); + + cudaEventRecord(tstop); // ----------------------------------------- Timing end + cudaEventSynchronize(tstop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, tstart, tstop); + + ERRCHK_CUDA_KERNEL_ALWAYS(); + if (milliseconds < best_time) { + best_time = milliseconds; + best_dims = tpb; + } + } + } + } +#if VERBOSE_PRINTING + printf( + "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", + best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); +#endif + /* + FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); + ERRCHK(fp); + fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z); + fclose(fp); + */ + + rk3_tpb = best_dims; + return AC_SUCCESS; +} + +AcResult +acDeviceSynchronizeStream(const Device device, const Stream stream) +{ + cudaSetDevice(device->id); + if (stream == STREAM_ALL) { + cudaDeviceSynchronize(); + } + else { + cudaStreamSynchronize(device->streams[stream]); + } + return AC_SUCCESS; +} + +AcResult +acDeviceSwapBuffers(const Device device) +{ + cudaSetDevice(device->id); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + AcReal* tmp = device->vba.in[i]; + device->vba.in[i] = device->vba.out[i]; + device->vba.out[i] = tmp; + } + return AC_SUCCESS; +} + +AcResult +acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param, + const AcReal value) +{ + cudaSetDevice(device->id); + const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; + ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, + cudaMemcpyHostToDevice, device->streams[stream])); + return AC_SUCCESS; +} + +static AcResult +load_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes, + AcReal* dst) +{ + cudaSetDevice(device->id); + ERRCHK_CUDA( // + cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream]) // + ); + return AC_SUCCESS; +} + +AcResult +acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh, + const VertexBufferHandle vtxbuf_handle, const int3src, + const int3dst, const int num_vertices) +{ + cudaSetDevice(device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); + load_with_offset(device, stream, &host_mesh.vertex_buffer[vtxbuf_handle][src_idx], + num_vertices * sizeof(AcReal), &device->vba.in[vtxbuf_handle][dst_idx]); + + return AC_SUCCESS; +} + +AcResult +acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh, + const int3src, const int3dst, const int num_vertices) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src, + dst, num_vertices); + } + return AC_SUCCESS; +} + +AcResult +acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + acLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst, num_vertices); + + return AC_SUCCESS; +} + +AcResult +acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i); + } + + return AC_SUCCESS; +} + +static AcResult +store_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes, + AcReal* dst) +{ + cudaSetDevice(device->id); + ERRCHK_CUDA( // + cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) // + ); + return AC_SUCCESS; +} + +AcResult +acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + cudaSetDevice(device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); + store_with_offset(device, stream, &device->vba.in[vtxbuf_handle][src_idx], + num_vertices * sizeof(AcReal), + &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]); + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst, + num_vertices, host_mesh); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreVertexBuffer(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + + acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices, + host_mesh); + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3src, + const int3dst, const int num_vertices, Device dst_device) +{ + cudaSetDevice(src_device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); + + ERRCHK_CUDA( // + cudaMemcpyPeerAsync(&dst_device->vba.in[vtxbuf_handle][dst_idx], dst_device->id, + &src_device->vba.in[vtxbuf_handle][src_idx], src_device->id, + sizeof(src_device->vba.in[vtxbuf_handle][0]) * num_vertices, + src_device->streams[stream]) // + ); + return AC_SUCCESS; +} + +AcResult +acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3src, + const int3dst, const int num_vertices, Device dst_device) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + return AC_SUCCESS; +} + +AcResult +acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Device dst_device) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + return AC_SUCCESS; +} + +AcResult +acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceTransferVertexBuffer(src_device, stream, vtxbuf_handle, (VertexBufferHandle)i, src, + dst, num_vertices, dst_device); + } +} + +AcResult +acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt) +{ + cudaSetDevice(device->id); + const cudaStream_t stream = device->streams[stream]; + + const dim3 tpb = rk3_tpb; + + const int3 n = end - start; + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // + (unsigned int)ceil(n.y / AcReal(tpb.y)), // + (unsigned int)ceil(n.z / AcReal(tpb.z))); + + if (step_number == 0) + solve<0><<>>(start, end, device->vba, dt); + else if (step_number == 1) + solve<1><<>>(start, end, device->vba, dt); + else + solve<2><<>>(start, end, device->vba, dt); + + ERRCHK_CUDA_KERNEL(); + + return AC_SUCCESS; +} + +AcResult +acDevicePeriodicBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end) +{ + cudaSetDevice(device->id); + const cudaStream_t stream = device->streams[stream]; + + const dim3 tpb(8, 2, 8); + const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), + (unsigned int)ceil((end.y - start.y) / (float)tpb.y), + (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); + + kernel_periodic_boundconds<<>>(start, end, device->vba.in[vtxbuf_handle]); + ERRCHK_CUDA_KERNEL(); + + return AC_SUCCESS; +} + +AcResult +acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end); + } + return AC_SUCCESS; +} + +AcResult +acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result) +{ + cudaSetDevice(device->id); + + const int3 start = (int3){device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min]}; + + const int3 end = (int3){device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max]}; + + *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle], + device->reduce_scratchpad, device->reduce_result); + return AC_SUCCESS; +} + +AcResult +acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result) +{ + cudaSetDevice(device->id); + + const int3 start = (int3){device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min]}; + + const int3 end = (int3){device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max]}; + + *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0], + device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], + device->reduce_scratchpad, device->reduce_result); + return AC_SUCCESS; +} + +#if PACKED_DATA_TRANSFERS +// Functions for calling packed data transfers +#endif diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index e8e4042..7a4e54b 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -74,18 +74,6 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) vtxbuf[dst_idx] = vtxbuf[src_idx]; } -void -periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) -{ - const dim3 tpb(8, 2, 8); - const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), - (unsigned int)ceil((end.y - start.y) / (float)tpb.y), - (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); - - kernel_periodic_boundconds<<>>(start, end, vtxbuf); - ERRCHK_CUDA_KERNEL(); -} - /////////////////////////////////////////////////////////////////////////////////////////////////// #include