Merge branch 'master' into sink_20190723

This commit is contained in:
Miikka Vaisala
2019-09-16 10:57:15 +08:00
16 changed files with 1100 additions and 40 deletions

View File

@@ -50,6 +50,7 @@ endif ()
## Include directories
include_directories(.)
include_directories(include)
include_directories(${CMAKE_BINARY_DIR})
## Subdirectories
add_subdirectory(src/core) # The core library

5
acc/pc_mhd_solver/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
build
testbin
# Except this file
!.gitignore

View File

@@ -0,0 +1,75 @@
#include "stencil_definition.sdh"
Preprocessed Scalar
value(in ScalarField vertex)
{
return vertex[vertexIdx];
}
Preprocessed Vector
gradient(in ScalarField vertex)
{
return (Vector){derx(vertexIdx, vertex), dery(vertexIdx, vertex), derz(vertexIdx, vertex)};
}
#if LUPWD
Preprocessed Scalar
der6x_upwd(in ScalarField vertex)
{
Scalar inv_ds = AC_inv_dsx;
return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
(-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
Scalar(15.0) * (vertex[vertexIdx.x + 1, vertexIdx.y, vertexIdx.z] +
vertex[vertexIdx.x - 1, vertexIdx.y, vertexIdx.z]) -
Scalar(6.0) * (vertex[vertexIdx.x + 2, vertexIdx.y, vertexIdx.z] +
vertex[vertexIdx.x - 2, vertexIdx.y, vertexIdx.z]) +
vertex[vertexIdx.x + 3, vertexIdx.y, vertexIdx.z] +
vertex[vertexIdx.x - 3, vertexIdx.y, vertexIdx.z])};
}
Preprocessed Scalar
der6y_upwd(in ScalarField vertex)
{
Scalar inv_ds = AC_inv_dsy;
return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
(-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y + 1, vertexIdx.z] +
vertex[vertexIdx.x, vertexIdx.y - 1, vertexIdx.z]) -
Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y + 2, vertexIdx.z] +
vertex[vertexIdx.x, vertexIdx.y - 2, vertexIdx.z]) +
vertex[vertexIdx.x, vertexIdx.y + 3, vertexIdx.z] +
vertex[vertexIdx.x, vertexIdx.y - 3, vertexIdx.z])};
}
Preprocessed Scalar
der6z_upwd(in ScalarField vertex)
{
Scalar inv_ds = AC_inv_dsz;
return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
(-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 1] +
vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 1]) -
Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 2] +
vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 2]) +
vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 3] +
vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 3])};
}
#endif
Preprocessed Matrix
hessian(in ScalarField vertex)
{
Matrix hessian;
hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex),
derxz(vertexIdx, vertex)};
hessian.row[1] = (Vector){hessian.row[0].y, deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)};
hessian.row[2] = (Vector){hessian.row[0].z, hessian.row[1].z, derzz(vertexIdx, vertex)};
return hessian;
}

View File

@@ -0,0 +1,159 @@
#define LDENSITY (1)
#define LHYDRO (1)
#define LMAGNETIC (1)
#define LENTROPY (1)
#define LTEMPERATURE (0)
#define LFORCING (1)
#define LUPWD (1)
#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
// Int params
uniform int AC_max_steps;
uniform int AC_save_steps;
uniform int AC_bin_steps;
uniform int AC_bc_type;
// Real params
uniform Scalar AC_dt;
// Spacing
uniform Scalar AC_dsx;
uniform Scalar AC_dsy;
uniform Scalar AC_dsz;
uniform Scalar AC_dsmin;
// physical grid
uniform Scalar AC_xlen;
uniform Scalar AC_ylen;
uniform Scalar AC_zlen;
uniform Scalar AC_xorig;
uniform Scalar AC_yorig;
uniform Scalar AC_zorig;
// Physical units
uniform Scalar AC_unit_density;
uniform Scalar AC_unit_velocity;
uniform Scalar AC_unit_length;
// properties of gravitating star
uniform Scalar AC_star_pos_x;
uniform Scalar AC_star_pos_y;
uniform Scalar AC_star_pos_z;
uniform Scalar AC_M_star;
// Run params
uniform Scalar AC_cdt;
uniform Scalar AC_cdtv;
uniform Scalar AC_cdts;
uniform Scalar AC_nu_visc;
uniform Scalar AC_cs_sound;
uniform Scalar AC_eta;
uniform Scalar AC_mu0;
uniform Scalar AC_cp_sound;
uniform Scalar AC_gamma;
uniform Scalar AC_cv_sound;
uniform Scalar AC_lnT0;
uniform Scalar AC_lnrho0;
uniform Scalar AC_zeta;
uniform Scalar AC_trans;
// Other
uniform Scalar AC_bin_save_t;
// Initial condition params
uniform Scalar AC_ampl_lnrho;
uniform Scalar AC_ampl_uu;
uniform Scalar AC_angl_uu;
uniform Scalar AC_lnrho_edge;
uniform Scalar AC_lnrho_out;
// Forcing parameters. User configured.
uniform Scalar AC_forcing_magnitude;
uniform Scalar AC_relhel;
uniform Scalar AC_kmin;
uniform Scalar AC_kmax;
// Forcing parameters. Set by the generator.
uniform Scalar AC_forcing_phase;
uniform Scalar AC_k_forcex;
uniform Scalar AC_k_forcey;
uniform Scalar AC_k_forcez;
uniform Scalar AC_kaver;
uniform Scalar AC_ff_hel_rex;
uniform Scalar AC_ff_hel_rey;
uniform Scalar AC_ff_hel_rez;
uniform Scalar AC_ff_hel_imx;
uniform Scalar AC_ff_hel_imy;
uniform Scalar AC_ff_hel_imz;
// Additional helper params // (deduced from other params do not set these directly!)
uniform Scalar AC_G_CONST;
uniform Scalar AC_GM_star;
uniform Scalar AC_sq2GM_star;
uniform Scalar AC_cs2_sound;
uniform Scalar AC_inv_dsx;
uniform Scalar AC_inv_dsy;
uniform Scalar AC_inv_dsz;
// PC-style helical forcing with profiles
uniform Vector AC_kk;
uniform Vector AC_coef1;
uniform Vector AC_coef2;
uniform Vector AC_coef3;
uniform Vector AC_fda;
uniform Scalar AC_phase;
uniform Scalar AC_fact;
uniform Scalar AC_k1_ff;
uniform ScalarArray AC_profx;
uniform ScalarArray AC_profy;
uniform ScalarArray AC_profz;
uniform ScalarArray AC_profx_hel;
uniform ScalarArray AC_profy_hel;
uniform ScalarArray AC_profz_hel;
uniform int AC_iforcing_zsym;
uniform ScalarArray AC_0;
uniform ScalarArray AC_1;
uniform ScalarArray AC_2;
uniform ScalarArray AC_3;
uniform ScalarArray AC_4;
uniform ScalarArray AC_5;
uniform ScalarArray AC_6;
uniform ScalarArray AC_7;
uniform ScalarArray AC_8;
uniform ScalarArray AC_9;
uniform ScalarArray AC_10;
uniform ScalarArray AC_11;
uniform ScalarArray AC_12;
uniform ScalarArray AC_13;
uniform ScalarArray AC_14;
uniform ScalarArray AC_15;
uniform ScalarArray AC_16;
uniform ScalarArray AC_17;
uniform ScalarArray AC_18;
uniform ScalarArray AC_19;
/*
* =============================================================================
* User-defined vertex buffers
* =============================================================================
*/
#if LENTROPY
uniform ScalarField VTXBUF_LNRHO;
uniform ScalarField VTXBUF_UUX;
uniform ScalarField VTXBUF_UUY;
uniform ScalarField VTXBUF_UUZ;
uniform ScalarField VTXBUF_AX;
uniform ScalarField VTXBUF_AY;
uniform ScalarField VTXBUF_AZ;
uniform ScalarField VTXBUF_ENTROPY;
#elif LMAGNETIC
uniform ScalarField VTXBUF_LNRHO;
uniform ScalarField VTXBUF_UUX;
uniform ScalarField VTXBUF_UUY;
uniform ScalarField VTXBUF_UUZ;
uniform ScalarField VTXBUF_AX;
uniform ScalarField VTXBUF_AY;
uniform ScalarField VTXBUF_AZ;
#elif LHYDRO
uniform ScalarField VTXBUF_LNRHO;
uniform ScalarField VTXBUF_UUX;
uniform ScalarField VTXBUF_UUY;
uniform ScalarField VTXBUF_UUZ;
#else
uniform ScalarField VTXBUF_LNRHO;
#endif

View File

@@ -0,0 +1,358 @@
#include "stencil_definition.sdh"
Vector
value(in VectorField uu)
{
return (Vector){value(uu.x), value(uu.y), value(uu.z)};
}
#if LUPWD
Scalar
upwd_der6(in VectorField uu, in ScalarField lnrho)
{
Scalar uux = fabs(value(uu).x);
Scalar uuy = fabs(value(uu).y);
Scalar uuz = fabs(value(uu).z);
return (Scalar){uux * der6x_upwd(lnrho) + uuy * der6y_upwd(lnrho) + uuz * der6z_upwd(lnrho)};
}
#endif
Matrix
gradients(in VectorField uu)
{
return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
}
Scalar
continuity(in VectorField uu, in ScalarField lnrho)
{
return -dot(value(uu), gradient(lnrho))
#if LUPWD
// This is a corrective hyperdiffusion term for upwinding.
+ upwd_der6(uu, lnrho)
#endif
- divergence(uu);
}
#if LENTROPY
Vector
momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorField aa)
{
const Matrix S = stress_tensor(uu);
const Scalar cs2 = AC_cs2_sound * exp(AC_gamma * value(ss) / AC_cp_sound +
(AC_gamma - 1) * (value(lnrho) - AC_lnrho0));
const Vector j = (Scalar(1.) / AC_mu0) *
(gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
const Vector B = curl(aa);
// TODO: DOES INTHERMAL VERSTION INCLUDE THE MAGNETIC FIELD?
const Scalar inv_rho = Scalar(1.) / exp(value(lnrho));
// Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\)
// \1
const Vector mom = -mul(gradients(uu), value(uu)) -
cs2 * ((Scalar(1.) / AC_cp_sound) * gradient(ss) + gradient(lnrho)) +
inv_rho * cross(j, B) +
AC_nu_visc *
(laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
Scalar(2.) * mul(S, gradient(lnrho))) +
AC_zeta * gradient_of_divergence(uu);
return mom;
}
#elif LTEMPERATURE
Vector
momentum(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
{
Vector mom;
const Matrix S = stress_tensor(uu);
const Vector pressure_term = (AC_cp_sound - AC_cv_sound) *
(gradient(tt) + value(tt) * gradient(lnrho));
mom = -mul(gradients(uu), value(uu)) - pressure_term +
AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
Scalar(2.) * mul(S, gradient(lnrho))) +
AC_zeta * gradient_of_divergence(uu);
#if LGRAVITY
mom = mom - (Vector){0, 0, -10.0};
#endif
return mom;
}
#else
Vector
momentum(in VectorField uu, in ScalarField lnrho)
{
Vector mom;
const Matrix S = stress_tensor(uu);
// Isothermal: we have constant speed of sound
mom = -mul(gradients(uu), value(uu)) - AC_cs2_sound * gradient(lnrho) +
AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
Scalar(2.) * mul(S, gradient(lnrho))) +
AC_zeta * gradient_of_divergence(uu);
#if LGRAVITY
mom = mom - (Vector){0, 0, -10.0};
#endif
return mom;
}
#endif
Vector
induction(in VectorField uu, in VectorField aa)
{
// Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
// x A)) in order to avoid taking the first derivative twice (did the math,
// yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
// u cross B - AC_eta * AC_mu0 * (AC_mu0^-1 * [- laplace A + grad div A ])
const Vector B = curl(aa);
const Vector grad_div = gradient_of_divergence(aa);
const Vector lap = laplace_vec(aa);
// Note, AC_mu0 is cancelled out
const Vector ind = cross(value(uu), B) - AC_eta * (grad_div - lap);
return ind;
}
#if LENTROPY
Scalar
lnT(in ScalarField ss, in ScalarField lnrho)
{
const Scalar lnT = AC_lnT0 + AC_gamma * value(ss) / AC_cp_sound +
(AC_gamma - Scalar(1.)) * (value(lnrho) - AC_lnrho0);
return lnT;
}
// Nabla dot (K nabla T) / (rho T)
Scalar
heat_conduction(in ScalarField ss, in ScalarField lnrho)
{
const Scalar inv_AC_cp_sound = AcReal(1.) / AC_cp_sound;
const Vector grad_ln_chi = -gradient(lnrho);
const Scalar first_term = AC_gamma * inv_AC_cp_sound * laplace(ss) +
(AC_gamma - AcReal(1.)) * laplace(lnrho);
const Vector second_term = AC_gamma * inv_AC_cp_sound * gradient(ss) +
(AC_gamma - AcReal(1.)) * gradient(lnrho);
const Vector third_term = AC_gamma * (inv_AC_cp_sound * gradient(ss) + gradient(lnrho)) +
grad_ln_chi;
const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * AC_cp_sound);
return AC_cp_sound * chi * (first_term + dot(second_term, third_term));
}
Scalar
heating(const int i, const int j, const int k)
{
return 1;
}
Scalar
entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorField aa)
{
const Matrix S = stress_tensor(uu);
const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
const Vector j = (Scalar(1.) / AC_mu0) *
(gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
const Scalar RHS = H_CONST - C_CONST + AC_eta * (AC_mu0)*dot(j, j) +
Scalar(2.) * exp(value(lnrho)) * AC_nu_visc * contract(S) +
AC_zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
return -dot(value(uu), gradient(ss)) + inv_pT * RHS + heat_conduction(ss, lnrho);
}
#endif
#if LTEMPERATURE
Scalar
heat_transfer(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
{
const Matrix S = stress_tensor(uu);
const Scalar heat_diffusivity_k = 0.0008; // 8e-4;
return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) +
heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) +
AC_nu_visc * contract(S) * (Scalar(1.) / AC_cv_sound) -
(AC_gamma - 1) * value(tt) * divergence(uu);
}
#endif
#if LFORCING
Vector
simple_vortex_forcing(Vector a, Vector b, Scalar magnitude)
{
return magnitude * cross(normalized(b - a), (Vector){0, 0, 1}); // Vortex
}
Vector
simple_outward_flow_forcing(Vector a, Vector b, Scalar magnitude)
{
return magnitude * (1 / length(b - a)) * normalized(b - a); // Outward flow
}
// The Pencil Code forcing_hel_noshear(), manual Eq. 222, inspired forcing function with adjustable
// helicity
Vector
helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
{
// JP: This looks wrong:
// 1) Should it be AC_dsx * AC_nx instead of AC_dsx * AC_ny?
// 2) Should you also use globalGrid.n instead of the local n?
// MV: You are rigth. Made a quickfix. I did not see the error because multigpu is split
// in z direction not y direction.
// 3) Also final point: can we do this with vectors/quaternions instead?
// Tringonometric functions are much more expensive and inaccurate/
// MV: Good idea. No an immediate priority.
// Fun related article:
// https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
xx.x = xx.x * (2.0 * M_PI / (AC_dsx * globalGridN.x));
xx.y = xx.y * (2.0 * M_PI / (AC_dsy * globalGridN.y));
xx.z = xx.z * (2.0 * M_PI / (AC_dsz * globalGridN.z));
Scalar cos_phi = cos(phi);
Scalar sin_phi = sin(phi);
Scalar cos_k_dot_x = cos(dot(k_force, xx));
Scalar sin_k_dot_x = sin(dot(k_force, xx));
// Phase affect only the x-component
// Scalar real_comp = cos_k_dot_x;
// Scalar imag_comp = sin_k_dot_x;
Scalar real_comp_phase = cos_k_dot_x * cos_phi - sin_k_dot_x * sin_phi;
Scalar imag_comp_phase = cos_k_dot_x * sin_phi + sin_k_dot_x * cos_phi;
Vector force = (Vector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
return force;
}
Vector
forcing_OLD(int3 globalVertexIdx, Scalar dt)
{
Vector a = Scalar(.5) * (Vector){globalGridN.x * AC_dsx, globalGridN.y * AC_dsy,
globalGridN.z * AC_dsz}; // source (origin)
Vector xx = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
(globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
(globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz}; // sink (current index)
const Scalar cs2 = AC_cs2_sound;
const Scalar cs = sqrt(cs2);
// Placeholders until determined properly
Scalar magnitude = AC_forcing_magnitude;
Scalar phase = AC_forcing_phase;
Vector k_force = (Vector){AC_k_forcex, AC_k_forcey, AC_k_forcez};
Vector ff_re = (Vector){AC_ff_hel_rex, AC_ff_hel_rey, AC_ff_hel_rez};
Vector ff_im = (Vector){AC_ff_hel_imx, AC_ff_hel_imy, AC_ff_hel_imz};
// Determine that forcing funtion type at this point.
// Vector force = simple_vortex_forcing(a, xx, magnitude);
// Vector force = simple_outward_flow_forcing(a, xx, magnitude);
Vector force = helical_forcing(magnitude, k_force, xx, ff_re, ff_im, phase);
// Scaling N = magnitude*cs*sqrt(k*cs/dt) * dt
const Scalar NN = cs * sqrt(AC_kaver * cs);
// MV: Like in the Pencil Code. I don't understandf the logic here.
force.x = sqrt(dt) * NN * force.x;
force.y = sqrt(dt) * NN * force.y;
force.z = sqrt(dt) * NN * force.z;
if (is_valid(force)) {
return force;
}
else {
return (Vector){0, 0, 0};
}
}
// PC-style helical forcing with support for profiles
Vector
forcing(int3 vertexIdx, int3 globalVertexIdx, Scalar dt, ScalarArray profx, ScalarArray profy,
ScalarArray profz, ScalarArray profx_hel, ScalarArray profy_hel, ScalarArray profz_hel)
{
Vector pos = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
(globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
(globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz};
Complex fx = AC_fact * exp(Complex(0, AC_kk.x * AC_k1_ff * pos.z + AC_phase));
Complex fy = exp(Complex(0, AC_kk.y * AC_k1_ff * pos.y));
Complex fz;
if (AC_iforcing_zsym == 0) {
fz = exp(Complex(0., AC_kk.z * AC_k1_ff * pos.z));
}
else if (AC_iforcing_zsym == 1) {
fz = Complex(cos(AC_kk.z * AC_k1_ff * pos.z), 0);
}
else if (AC_iforcing_zsym == -1) {
fz = Complex(sin(AC_kk.z * AC_k1_ff * pos.z), 0);
}
else {
// Failure
}
Complex fxyz = fx * fy * fz;
// TODO recheck indices
Scalar force_ampl = profx[vertexIdx.x - NGHOST] * profy[vertexIdx.y] * profz[vertexIdx.z];
Scalar prof_hel_ampl = profx_hel[vertexIdx.x - NGHOST] * profy_hel[vertexIdx.y] *
profz_hel[vertexIdx.z];
return force_ampl * AC_fda * (Complex(AC_coef1.x, prof_hel_ampl * AC_coef2.x) * fxyz).x;
}
#endif // LFORCING
// Declare input and output arrays using locations specified in the
// array enum in astaroth.h
in ScalarField lnrho(VTXBUF_LNRHO);
out ScalarField out_lnrho(VTXBUF_LNRHO);
in VectorField uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
out VectorField out_uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
#if LMAGNETIC
in VectorField aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
out VectorField out_aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
#endif
#if LENTROPY
in ScalarField ss(VTXBUF_ENTROPY);
out ScalarField out_ss(VTXBUF_ENTROPY);
#endif
#if LTEMPERATURE
in ScalarField tt(VTXBUF_TEMPERATURE);
out ScalarField out_tt(VTXBUF_TEMPERATURE);
#endif
Kernel void
solve()
{
Scalar dt = AC_dt;
out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
#if LMAGNETIC
out_aa = rk3(out_aa, aa, induction(uu, aa), dt);
#endif
#if LENTROPY
out_uu = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt);
out_ss = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt);
#elif LTEMPERATURE
out_uu = rk3(out_uu, uu, momentum(uu, lnrho, tt), dt);
out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt);
#else
out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt);
#endif
#if LFORCING
if (step_number == 2) {
out_uu = out_uu + forcing(vertexIdx, globalVertexIdx, dt, AC_profx, AC_profy, AC_profz,
AC_profx_hel, AC_profy_hel, AC_profz_hel);
}
#endif
}

View File

@@ -15,8 +15,10 @@ L [a-zA-Z_]
"void" { return VOID; } /* Rest of the types inherited from C */
"int" { return INT; }
"int3" { return INT3; }
"Complex" { return COMPLEX; }
"ScalarField" { return SCALARFIELD; }
"VectorField" { return VECTOR; }
"ScalarArray" { return SCALARARRAY; }
"Kernel" { return KERNEL; } /* Function specifiers */
"Preprocessed" { return PREPROCESSED; }

View File

@@ -16,8 +16,8 @@ int yyget_lineno();
%token CONSTANT IN OUT UNIFORM
%token IDENTIFIER NUMBER
%token RETURN
%token SCALAR VECTOR MATRIX SCALARFIELD
%token VOID INT INT3
%token SCALAR VECTOR MATRIX SCALARFIELD SCALARARRAY
%token VOID INT INT3 COMPLEX
%token IF ELSE FOR WHILE ELIF
%token LEQU LAND LOR LLEQU
%token KERNEL PREPROCESSED
@@ -210,6 +210,8 @@ type_specifier: VOID
| VECTOR { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VECTOR; }
| MATRIX { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = MATRIX; }
| SCALARFIELD { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARFIELD; }
| SCALARARRAY { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARARRAY; }
| COMPLEX { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = COMPLEX; }
;
identifier: IDENTIFIER { $$ = astnode_create(NODE_IDENTIFIER, NULL, NULL); astnode_set_buffer(yytext, $$); }

View File

@@ -61,6 +61,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
[VECTOR] = "AcReal3",
[MATRIX] = "AcMatrix",
[SCALARFIELD] = "AcReal",
[SCALARARRAY] = "const AcReal* __restrict__",
[COMPLEX] = "acComplex",
// Type qualifiers
[KERNEL] = "template <int step_number> static __global__",
//__launch_bounds__(RK_THREADBLOCK_SIZE,
@@ -380,20 +382,13 @@ traverse(const ASTNode* node)
if (handle >= 0) { // The variable exists in the symbol table
const Symbol* symbol = &symbol_table[handle];
// if (symbol->type_qualifier == OUT) {
// printf("%s%s", inout_name_prefix, symbol->identifier);
//}
if (symbol->type_qualifier == UNIFORM) {
printf("DCONST(%s) ", symbol->identifier);
/*
if (symbol->type_specifier == SCALAR)
printf("DCONST_REAL(AC_%s) ", symbol->identifier);
else if (symbol->type_specifier == INT)
printf("DCONST_INT(AC_%s) ", symbol->identifier);
else
printf("INVALID UNIFORM type specifier %s with %s\n",
translate(symbol->type_specifier), symbol->identifier);
*/
if (inside_kernel && symbol->type_specifier == SCALARARRAY) {
printf("buffer.profiles[%s] ", symbol->identifier);
}
else {
printf("DCONST(%s) ", symbol->identifier);
}
}
else {
// Do a regular translation
@@ -613,6 +608,15 @@ generate_header(void)
}
printf("\n\n");
// Scalar arrays
printf("#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
for (int i = 0; i < num_symbols; ++i) {
if (symbol_table[i].type_specifier == SCALARARRAY) {
printf("\\\nFUNC(%s),", symbol_table[i].identifier);
}
}
printf("\n\n");
/*
printf("\n");
printf("typedef struct {\n");

View File

@@ -45,23 +45,394 @@ The foundational work was done in (Väisälä, Pekkilä, 2017) and the library,
# Astaroth API
## Devices and nodes
The Astroth application-programming interface (API) provides the means for controlling execution of user-defined and built-in functions on multiple graphics processing units. Functions in the API are prefixed with lower case ```ac```, while structures and data types are prefixed with capitalized ```Ac```. Compile-time constants, such as definitions and enumerations, have the prefix ```AC_```. All of the API functions return an AcResult value indicating either success or failure. The return codes are
```C
typedef enum {
AC_SUCCESS = 0,
AC_FAILURE = 1
} AcResult;
```
The API is divided into layers which differ in the level of control provided over the execution. There are two primary layers:
* Device layer
> Functions start with acDevice*.
> Provides control over a single GPU.
> All functions are asynchronous.
* Node layer
> Functions start with acNode*.
> Provides control over multiple devices in a single node.
> All functions are asynchronous and executed concurrently on all devices in the node.
> Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.
Finally, a third layer is provided for convenience and backwards compatibility.
* Astaroth layer (deprecated)
> Functions start with ac* without Node or Device, f.ex. acInit().
> Provided for backwards compatibility.
> Essentially a wrapper for the Node layer.
> All functions are guaranteed to be synchronous.
There are also several helper functions defined in `include/astaroth_defines.h`, which can be used for, say, determining the size or performing index calculations within the simulation domain.
## List of Astaroth API functions
Here's a non-exhaustive list of astaroth API functions. For more info and an up-to-date list, see the corresponding header files in `include/astaroth_defines.h`, `include/astaroth.h`, `include/astaroth_node.h`, `include/astaroth_device.h`.
### Initialization, quitting and helper functions
Device layer.
```C
AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device);
AcResult acDeviceDestroy(Device device);
AcResult acDevicePrintInfo(const Device device);
AcResult acDeviceAutoOptimize(const Device device);
```
Node layer.
```C
AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
AcResult acNodeDestroy(Node node);
AcResult acNodePrintInfo(const Node node);
AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
AcResult acNodeAutoOptimize(const Node node);
```
General helper functions.
```C
size_t acVertexBufferSize(const AcMeshInfo info);
size_t acVertexBufferSizeBytes(const AcMeshInfo info);
size_t acVertexBufferCompdomainSize(const AcMeshInfo info);
size_t acVertexBufferCompdomainSizeBytes(const AcMeshInfo info);
size_t acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo info);
```
### Loading and storing
Loading meshes and vertex buffers to device memory.
```C
AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh);
AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream,
const AcMesh host_mesh, const int3 src, const int3 dst,
const int num_vertices);
AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh);
AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices);
AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
```
Storing meshes and vertex buffer to host memory.
```C
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh);
AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
```
Transferring data between devices
```C
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
const int3 src, const int3 dst, const int num_vertices,
Device* dst_device);
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device);
AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst,
const int num_vertices, Device dst_device);
```
Loading uniforms (device constants)
```C
AcResult acDeviceLoadScalarConstant(const Device device, const Stream stream,
const AcRealParam param, const AcReal value);
AcResult acDeviceLoadVectorConstant(const Device device, const Stream stream,
const AcReal3Param param, const AcReal3 value);
AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const AcIntParam param,
const int value);
AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
const int3 value);
AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
const ScalarArrayHandle handle, const AcReal* data,
const size_t num);
AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
const AcMeshInfo device_config);
```
### Computation
```C
AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end);
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end);
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
AcResult acNodeIntegrate(const Node node, const AcReal dt);
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
```
### Stream synchronization
All library functions that take a `Stream` as a parameter are asynchronous. When calling these functions,
the control returns immediately back to the host even if the called device function has not yet completed.
Therefore special care must be taken in order to ensure proper synchronization.
Synchronization is done using `Stream` primitives, defined as
```C
typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
#define STREAM_ALL (NUM_STREAMS)
```
Functions queued in the same stream will be executed sequentially. If two or more consequent
functions are queued in different streams, then these functions may execute in parallel. Finally,
there is a barrier synchronization function, which blocks until all functions in some stream have
completed. The Astaroth API provides barrier synchronization with functions `acDeviceSynchronize` and
`acNodeSynchronize`. All streams can be synchronized at once by passing the alias `STREAM_ALL` to
the synchronization function.
Usage of streams is demonstrated with the following example.
```C
funcA(STREAM_0);
funcB(STREAM_0); // Blocks until funcA has completed
funcC(STREAM_1); // May execute in parallel with funcB
synchronizeStream(STREAM_ALL); // Blocks until functions in all streams have completed
funcD(STREAM_2); // Is started when command returns from synchronizeStream()
```
### Data synchronization
Stream synchronization works in the same fashion on node and device layers. However, on the node
layer one has to take in account that a portion of the mesh is shared between devices and that the
data is always up to date.
In stencil computations, the mesh is surrounded by a halo, where data is only used for updating grid
points near the boundaries of the simulation domain. A portion of this halo is shared by neighboring
devices. As there is no way of knowing when the user has completed operations on the mesh, the data
communication between neighboring devices must be explicitly triggered. For this purpose, we provide
the functions
```C
AcResult acNodeSynchronizeMesh(const Node node, const Stream stream);
AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
```
> **NOTE**: Local halos must be up to date before synchronizing the data. Local halos are the grid points outside the computational domain which are used only by a single device. The mesh is distributed to multiple devices by blocking along the z axis. If there are *n* devices and the z-dimension of the computational domain is *nz*, then each device is assigned *nz / n* two-dimensional planes. For example with two devices, the data block that has to be up to date ranges from *(0, 0, nz)* to *(mx, my, nz + 2 * NGHOST)*
### Input and output buffers
The mesh is duplicated to input and output buffers for performance reasons. The input buffers are
read-only in user-specified compute kernels, which allows us to read them via the texture cache optimized
for spatially local memory accesses. The results of compute kernels are written into the output buffers.
Since we allow the user to operate on subsets of the computational domain in user-specified kernels, we have no way to know when the output buffers are complete and can be swapped. Therefore the user should explicitly state when the input and output buffer should be swapped. This is done via the API calls
```C
AcResult acDeviceSwapBuffers(const Device device);
AcResult acNodeSwapBuffers(const Node node);
```
> **NOTE**: All functions provided with the API operate on input buffers and ensure that the complete result is available in the input buffer when the function has completed. User-specified kernels are exceptions and write the result to output buffers. Therefore buffers have to be swapped only after calling user-specified kernels.
## Devices
`Device` is a handle to some single device and is used in device layer functions to specify which device should execute the function. A `Device` is created and destroyed with the following interface functions.
```C
AcResult acDeviceCreate(const int device_id, const AcMeshInfo device_config, Device* device);
AcResult acDeviceDestroy(Device device);
```
## Nodes
`Node` is a handle to some compute node which consists of multiple devices. The `Node` handle is used to specify which node the node layer functions should operate in. A node is created and destroyed with the following interface functions.
```C
AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
AcResult acNodeDestroy(Node node);
```
The function acNodeCreate calls acDeviceCreate for all devices that are visible from the current process. After a node has been created, the devices in it can be retrived with the function
```C
AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
```
where DeviceConfiguration is defined as
```C
typedef struct {
int num_devices;
Device* devices; // Todo make this predefined s.t. the user/us do not have to alloc/free
Grid grid;
Grid subgrid;
} DeviceConfiguration;
```
## Meshes
## Streams and synchronization
Meshes are the primary structures for passing information to the library and kernels. The definition
of a `Mesh` is declared as
```C
typedef struct {
int int_params[NUM_INT_PARAMS];
int3 int3_params[NUM_INT3_PARAMS];
AcReal real_params[NUM_REAL_PARAMS];
AcReal3 real3_params[NUM_REAL3_PARAMS];
} AcMeshInfo;
## Interface
typedef struct {
AcReal* vertex_buffer[NUM_VTXBUF_HANDLES];
AcMeshInfo info;
} AcMesh;
```
Several built-in parameters, such as the dimensions of the mesh, and all user-defined parameters
are defined in the `AcMeshInfo` structure. Before passing AcMeshInfo to an initialization function,
such as `acDeviceCreate()`, the built-in parameters `AC_nx, AC_ny, AC_nz` must be set. These
parameters define the dimensions of the computational domain of the mesh. For example,
```C
AcMeshInfo info;
info.int_params[AC_nx] = 128;
info.int_params[AC_ny] = 64;
info.int_params[AC_nz] = 32;
Device device;
acDeviceCreate(0, info, &device);
```
AcMesh is used in loading and storing data from host to device and vice versa. Before calling for
example `acDeviceLoadMesh()`, one must ensure that all `NUM_VTXBUF_HANDLES` are pointers to valid
arrays in host memory consisting of `mx * my * mz` elements and stored in order
`i + j * mx + k * mx * my`, where `i`, `j` and `k` correspond to `x`, `y` and `z` coordinate
indices, respectively. The mesh dimensions can be queried with
```C
int mx = info.int_params[AC_mx];
int my = info.int_params[AC_my];
int mz = info.int_params[AC_mz];
```
after initialization.
### Decomposition
Grids and subgrids contain the dimensions of the the mesh decomposed to multiple devices.
```C
typedef struct {
int3 m; // Size of the simulation domain (includes the ghost zones)
int3 n; // Size of the computational domain (without ghost zones)
} Grid;
```
As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple devices
by blocking the data along the *z*-axis. Given the mesh dimensions *(mx, my, mz)*, its computational
domain *(nx, ny, nz)* and *n* number of devices, then each device is assigned a mesh of size
*(mx, my, 2 * NGHOST + nz/n)* and a computational domain of size *(nx, ny, nz/n)*.
Let *i* be the device id. The portion of the halos shared by neighboring devices is then *(0, 0, i * nz/n)* - *(mx, my, 2 * NGHOST + i * nz/n)*. The functions `acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among the devices in the node.
## Integration, reductions and boundary conditions
The library provides the following functions for integration, reductions and computing periodic boundary conditions.
```C
AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end);
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end);
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
AcResult acNodeIntegrate(const Node node, const AcReal dt);
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
```
Finally, there's a library function that is automatically generated for all user-specified `Kernel`
functions written with the Astaroth DSL,
```C
AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,
const int3 start, const int3 end);
```
Where `##identifier` is replaced with the name of the user-specified kernel. For example, a device
function `Kernel solve()` can be called with `acDeviceKernel_solve()` via the API.
# Astaroth DSL
## Uniforms
### Control flow and implementing switches
// Runtime constants are as fast as compile-time constants as long as
// 1) They are not placed in tight loops, especially those that inlcude global memory accesses, that could be unrolled
// 2) They are not multiplied with each other
// 3) At least 32 neighboring threads in the x-axis access the same constant
// Safe and efficient to use as switches
## Vertex buffers
### Input and output buffers
## Built-in variables and functions
## Functions
### Kernel
### Preprocessed

View File

@@ -127,9 +127,9 @@ typedef enum {
STREAM_14,
STREAM_15,
STREAM_16,
NUM_STREAM_TYPES
NUM_STREAMS
} Stream;
#define STREAM_ALL (NUM_STREAM_TYPES)
#define STREAM_ALL (NUM_STREAMS)
#define AC_GEN_ID(X) X
typedef enum {
@@ -156,6 +156,11 @@ typedef enum {
NUM_REAL3_PARAMS
} AcReal3Param;
typedef enum {
AC_FOR_SCALARARRAY_HANDLES(AC_GEN_ID) //
NUM_SCALARARRAY_HANDLES
} ScalarArrayHandle;
typedef enum {
AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) //
NUM_VTXBUF_HANDLES
@@ -166,6 +171,7 @@ extern const char* intparam_names[];
extern const char* int3param_names[];
extern const char* realparam_names[];
extern const char* real3param_names[];
extern const char* scalararray_names[];
extern const char* vtxbuf_names[];
typedef struct {

View File

@@ -61,6 +61,11 @@ AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const
AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
const int3 value);
/** */
AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
const ScalarArrayHandle handle, const size_t start,
const AcReal* data, const size_t num);
/** */
AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
const AcMeshInfo device_config);

View File

@@ -5,6 +5,7 @@ then
echo "ASTAROTH_HOME environment variable not set, run \"source ./sourceme.sh\" in Astaroth home directory"
exit 1
fi
OUTPUT_DIR=${PWD}
KERNEL_DIR=${AC_HOME}"/src/core/kernels"
ACC_DIR=${AC_HOME}"/acc"
@@ -25,11 +26,19 @@ while [ "$#" -gt 0 ]
do
case $1 in
-h|--help)
echo "You can set a custom files for DSL under the path $AC_HOME/"
echo "This script will help to compile DSL to CUDA code."
echo "The resulting kernels will be stored to $OUTPUT_DIR."
echo "You can set a custom files for DSL under the path $AC_DIR/"
echo "Example:"
echo "compile_acc.sh -a custom_setup/custom_assembly.sas -p custom_setup/custom_process.sps --header custom_setup/custom_header.h"
exit 0
;;
-I|--include)
shift
ACC_INCLUDE_DIR=${1}
shift
echo "CUSTOM include dir!"
;;
--header)
shift
ACC_HEADER=${1}
@@ -63,11 +72,11 @@ ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS}
${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS}
${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_HEADER}
echo "Moving stencil_assembly.cuh -> ${AC_HOME}/src/core/kernels"
mv stencil_assembly.cuh ${AC_HOME}/src/core/kernels
echo "Moving stencil_assembly.cuh -> ${OUTPUT_DIR}"
mv stencil_assembly.cuh ${OUTPUT_DIR}
echo "Moving stencil_process.cuh -> ${AC_HOME}/src/core/kernels"
mv stencil_process.cuh ${AC_HOME}/src/core/kernels
echo "Moving stencil_process.cuh -> ${OUTPUT_DIR}"
mv stencil_process.cuh ${OUTPUT_DIR}
echo "Moving stencil_defines.cuh -> ${AC_HOME}/include"
mv stencil_defines.h ${AC_HOME}/include
echo "Moving stencil_defines.cuh -> ${OUTPUT_DIR}"
mv stencil_defines.h ${OUTPUT_DIR}

View File

@@ -22,15 +22,16 @@
#include "math_utils.h" // int3 + int3
#define AC_GEN_STR(X) #X
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
#undef AC_GEN_STR
static const int num_nodes = 1;

View File

@@ -37,6 +37,8 @@
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
struct device_s {
@@ -44,7 +46,7 @@ struct device_s {
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
cudaStream_t streams[NUM_STREAMS];
// Memory
VertexBufferArray vba;
@@ -97,6 +99,32 @@ DCONST(const VertexBufferHandle handle)
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
//#include <thrust/complex.h>
// using namespace thrust;
#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
//#include <complex>
#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"
@@ -135,16 +163,26 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
for (int i = 0; i < NUM_STREAMS; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
// VBA in/out
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
// VBA Profiles
const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
max(device_config.int_params[AC_my],
device_config.int_params[AC_mz]));
for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
}
// Reductions
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
@@ -178,6 +216,10 @@ acDeviceDestroy(Device device)
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
cudaFree(device->vba.profiles[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
@@ -186,7 +228,7 @@ acDeviceDestroy(Device device)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
for (int i = 0; i < NUM_STREAMS; ++i) {
cudaStreamDestroy(device->streams[i]);
}
@@ -405,6 +447,21 @@ acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3P
return AC_SUCCESS;
}
AcResult
acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
const size_t start, const AcReal* data, const size_t num)
{
cudaSetDevice(device->id);
ERRCHK(start + num <= max(device->local_config.int_params[AC_mx],
max(device->local_config.int_params[AC_my],
device->local_config.int_params[AC_mz])));
ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
{

View File

@@ -70,11 +70,11 @@ create_rotz(const AcReal radians)
#define cos __cosf
#define exp __expf
*/
#define sin sinf
#define cos cosf
#define exp expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
//#define sin sinf
//#define cos cosf
//#define exp expf
//#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
* =============================================================================

View File

@@ -124,6 +124,11 @@ static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal
dot(const AcReal3& a, const AcReal3& b)
{