Merge branch 'master' into sink_20190723

2019-09-16 10:57:15 +08:00
parent 88a8198810 55e4357d77
commit 42f92c7d49
16 changed files with 1100 additions and 40 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,7 @@ endif ()
 ## Include directories
 include_directories(.)
 include_directories(include)
+include_directories(${CMAKE_BINARY_DIR})

 ## Subdirectories
 add_subdirectory(src/core) # The core library
--- a/acc/pc_mhd_solver/.gitignore
+++ b/acc/pc_mhd_solver/.gitignore
@@ -0,0 +1,5 @@
+build
+testbin
+
+# Except this file
+!.gitignore
--- a/acc/pc_mhd_solver/stencil_assembly.sas
+++ b/acc/pc_mhd_solver/stencil_assembly.sas
@@ -0,0 +1,75 @@
+#include "stencil_definition.sdh"
+
+Preprocessed Scalar
+value(in ScalarField vertex)
+{
+    return vertex[vertexIdx];
+}
+
+Preprocessed Vector
+gradient(in ScalarField vertex)
+{
+    return (Vector){derx(vertexIdx, vertex), dery(vertexIdx, vertex), derz(vertexIdx, vertex)};
+}
+
+#if LUPWD
+
+Preprocessed Scalar
+der6x_upwd(in ScalarField vertex)
+{
+    Scalar inv_ds = AC_inv_dsx;
+
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x + 1, vertexIdx.y, vertexIdx.z] +
+                                     vertex[vertexIdx.x - 1, vertexIdx.y, vertexIdx.z]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x + 2, vertexIdx.y, vertexIdx.z] +
+                                    vertex[vertexIdx.x - 2, vertexIdx.y, vertexIdx.z]) +
+                     vertex[vertexIdx.x + 3, vertexIdx.y, vertexIdx.z] +
+                     vertex[vertexIdx.x - 3, vertexIdx.y, vertexIdx.z])};
+}
+
+Preprocessed Scalar
+der6y_upwd(in ScalarField vertex)
+{
+    Scalar inv_ds = AC_inv_dsy;
+
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y + 1, vertexIdx.z] +
+                                     vertex[vertexIdx.x, vertexIdx.y - 1, vertexIdx.z]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y + 2, vertexIdx.z] +
+                                    vertex[vertexIdx.x, vertexIdx.y - 2, vertexIdx.z]) +
+                     vertex[vertexIdx.x, vertexIdx.y + 3, vertexIdx.z] +
+                     vertex[vertexIdx.x, vertexIdx.y - 3, vertexIdx.z])};
+}
+
+Preprocessed Scalar
+der6z_upwd(in ScalarField vertex)
+{
+    Scalar inv_ds = AC_inv_dsz;
+
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 1] +
+                                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 1]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 2] +
+                                    vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 2]) +
+                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 3] +
+                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 3])};
+}
+
+#endif
+
+Preprocessed Matrix
+hessian(in ScalarField vertex)
+{
+    Matrix hessian;
+
+    hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex),
+                              derxz(vertexIdx, vertex)};
+    hessian.row[1] = (Vector){hessian.row[0].y, deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)};
+    hessian.row[2] = (Vector){hessian.row[0].z, hessian.row[1].z, derzz(vertexIdx, vertex)};
+
+    return hessian;
+}
--- a/acc/pc_mhd_solver/stencil_definition.sdh
+++ b/acc/pc_mhd_solver/stencil_definition.sdh
@@ -0,0 +1,159 @@
+#define LDENSITY (1)
+#define LHYDRO (1)
+#define LMAGNETIC (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+#define LFORCING (1)
+#define LUPWD (1)
+
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
+// Int params
+uniform int AC_max_steps;
+uniform int AC_save_steps;
+uniform int AC_bin_steps;
+uniform int AC_bc_type;
+
+// Real params
+uniform Scalar AC_dt;
+// Spacing
+uniform Scalar AC_dsx;
+uniform Scalar AC_dsy;
+uniform Scalar AC_dsz;
+uniform Scalar AC_dsmin;
+// physical grid
+uniform Scalar AC_xlen;
+uniform Scalar AC_ylen;
+uniform Scalar AC_zlen;
+uniform Scalar AC_xorig;
+uniform Scalar AC_yorig;
+uniform Scalar AC_zorig;
+// Physical units
+uniform Scalar AC_unit_density;
+uniform Scalar AC_unit_velocity;
+uniform Scalar AC_unit_length;
+// properties of gravitating star
+uniform Scalar AC_star_pos_x;
+uniform Scalar AC_star_pos_y;
+uniform Scalar AC_star_pos_z;
+uniform Scalar AC_M_star;
+//  Run params
+uniform Scalar AC_cdt;
+uniform Scalar AC_cdtv;
+uniform Scalar AC_cdts;
+uniform Scalar AC_nu_visc;
+uniform Scalar AC_cs_sound;
+uniform Scalar AC_eta;
+uniform Scalar AC_mu0;
+uniform Scalar AC_cp_sound;
+uniform Scalar AC_gamma;
+uniform Scalar AC_cv_sound;
+uniform Scalar AC_lnT0;
+uniform Scalar AC_lnrho0;
+uniform Scalar AC_zeta;
+uniform Scalar AC_trans;
+//  Other
+uniform Scalar AC_bin_save_t;
+//  Initial condition params
+uniform Scalar AC_ampl_lnrho;
+uniform Scalar AC_ampl_uu;
+uniform Scalar AC_angl_uu;
+uniform Scalar AC_lnrho_edge;
+uniform Scalar AC_lnrho_out;
+//  Forcing parameters. User configured.
+uniform Scalar AC_forcing_magnitude;
+uniform Scalar AC_relhel;
+uniform Scalar AC_kmin;
+uniform Scalar AC_kmax;
+//  Forcing parameters. Set by the generator.
+uniform Scalar AC_forcing_phase;
+uniform Scalar AC_k_forcex;
+uniform Scalar AC_k_forcey;
+uniform Scalar AC_k_forcez;
+uniform Scalar AC_kaver;
+uniform Scalar AC_ff_hel_rex;
+uniform Scalar AC_ff_hel_rey;
+uniform Scalar AC_ff_hel_rez;
+uniform Scalar AC_ff_hel_imx;
+uniform Scalar AC_ff_hel_imy;
+uniform Scalar AC_ff_hel_imz;
+//  Additional helper params  //  (deduced from other params do not set these directly!)
+uniform Scalar AC_G_CONST;
+uniform Scalar AC_GM_star;
+uniform Scalar AC_sq2GM_star;
+uniform Scalar AC_cs2_sound;
+uniform Scalar AC_inv_dsx;
+uniform Scalar AC_inv_dsy;
+uniform Scalar AC_inv_dsz;
+
+// PC-style helical forcing with profiles
+uniform Vector AC_kk;
+uniform Vector AC_coef1;
+uniform Vector AC_coef2;
+uniform Vector AC_coef3;
+uniform Vector AC_fda;
+
+uniform Scalar AC_phase;
+uniform Scalar AC_fact;
+uniform Scalar AC_k1_ff;
+
+uniform ScalarArray AC_profx;
+uniform ScalarArray AC_profy;
+uniform ScalarArray AC_profz;
+uniform ScalarArray AC_profx_hel;
+uniform ScalarArray AC_profy_hel;
+uniform ScalarArray AC_profz_hel;
+
+uniform int AC_iforcing_zsym;
+
+uniform ScalarArray AC_0;
+uniform ScalarArray AC_1;
+uniform ScalarArray AC_2;
+uniform ScalarArray AC_3;
+uniform ScalarArray AC_4;
+uniform ScalarArray AC_5;
+uniform ScalarArray AC_6;
+uniform ScalarArray AC_7;
+uniform ScalarArray AC_8;
+uniform ScalarArray AC_9;
+uniform ScalarArray AC_10;
+uniform ScalarArray AC_11;
+uniform ScalarArray AC_12;
+uniform ScalarArray AC_13;
+uniform ScalarArray AC_14;
+uniform ScalarArray AC_15;
+uniform ScalarArray AC_16;
+uniform ScalarArray AC_17;
+uniform ScalarArray AC_18;
+uniform ScalarArray AC_19;
+
+/*
+ * =============================================================================
+ * User-defined vertex buffers
+ * =============================================================================
+ */
+#if LENTROPY
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+uniform ScalarField VTXBUF_AX;
+uniform ScalarField VTXBUF_AY;
+uniform ScalarField VTXBUF_AZ;
+uniform ScalarField VTXBUF_ENTROPY;
+#elif LMAGNETIC
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+uniform ScalarField VTXBUF_AX;
+uniform ScalarField VTXBUF_AY;
+uniform ScalarField VTXBUF_AZ;
+#elif LHYDRO
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+#else
+uniform ScalarField VTXBUF_LNRHO;
+#endif
--- a/acc/pc_mhd_solver/stencil_process.sps
+++ b/acc/pc_mhd_solver/stencil_process.sps
@@ -0,0 +1,358 @@
+#include "stencil_definition.sdh"
+
+Vector
+value(in VectorField uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+#if LUPWD
+Scalar
+upwd_der6(in VectorField uu, in ScalarField lnrho)
+{
+    Scalar uux = fabs(value(uu).x);
+    Scalar uuy = fabs(value(uu).y);
+    Scalar uuz = fabs(value(uu).z);
+    return (Scalar){uux * der6x_upwd(lnrho) + uuy * der6y_upwd(lnrho) + uuz * der6z_upwd(lnrho)};
+}
+#endif
+
+Matrix
+gradients(in VectorField uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+Scalar
+continuity(in VectorField uu, in ScalarField lnrho)
+{
+    return -dot(value(uu), gradient(lnrho))
+#if LUPWD
+           // This is a corrective hyperdiffusion term for upwinding.
+           + upwd_der6(uu, lnrho)
+#endif
+           - divergence(uu);
+}
+
+#if LENTROPY
+Vector
+momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorField aa)
+{
+    const Matrix S   = stress_tensor(uu);
+    const Scalar cs2 = AC_cs2_sound * exp(AC_gamma * value(ss) / AC_cp_sound +
+                                          (AC_gamma - 1) * (value(lnrho) - AC_lnrho0));
+    const Vector j   = (Scalar(1.) / AC_mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Vector B = curl(aa);
+    // TODO: DOES INTHERMAL VERSTION INCLUDE THE MAGNETIC FIELD?
+    const Scalar inv_rho = Scalar(1.) / exp(value(lnrho));
+
+    // Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\)
+    // \1
+    const Vector mom = -mul(gradients(uu), value(uu)) -
+                       cs2 * ((Scalar(1.) / AC_cp_sound) * gradient(ss) + gradient(lnrho)) +
+                       inv_rho * cross(j, B) +
+                       AC_nu_visc *
+                           (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                            Scalar(2.) * mul(S, gradient(lnrho))) +
+                       AC_zeta * gradient_of_divergence(uu);
+    return mom;
+}
+#elif LTEMPERATURE
+Vector
+momentum(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
+{
+    Vector mom;
+
+    const Matrix S = stress_tensor(uu);
+
+    const Vector pressure_term = (AC_cp_sound - AC_cv_sound) *
+                                 (gradient(tt) + value(tt) * gradient(lnrho));
+
+    mom = -mul(gradients(uu), value(uu)) - pressure_term +
+          AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                        Scalar(2.) * mul(S, gradient(lnrho))) +
+          AC_zeta * gradient_of_divergence(uu);
+
+#if LGRAVITY
+    mom = mom - (Vector){0, 0, -10.0};
+#endif
+
+    return mom;
+}
+#else
+Vector
+momentum(in VectorField uu, in ScalarField lnrho)
+{
+    Vector mom;
+
+    const Matrix S = stress_tensor(uu);
+
+    // Isothermal: we have constant speed of sound
+
+    mom = -mul(gradients(uu), value(uu)) - AC_cs2_sound * gradient(lnrho) +
+          AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                        Scalar(2.) * mul(S, gradient(lnrho))) +
+          AC_zeta * gradient_of_divergence(uu);
+
+#if LGRAVITY
+    mom = mom - (Vector){0, 0, -10.0};
+#endif
+
+    return mom;
+}
+#endif
+
+Vector
+induction(in VectorField uu, in VectorField aa)
+{
+    // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
+    // x A)) in order to avoid taking the first derivative twice (did the math,
+    // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
+    // u cross B - AC_eta * AC_mu0 * (AC_mu0^-1 * [- laplace A + grad div A ])
+    const Vector B        = curl(aa);
+    const Vector grad_div = gradient_of_divergence(aa);
+    const Vector lap      = laplace_vec(aa);
+
+    // Note, AC_mu0 is cancelled out
+    const Vector ind = cross(value(uu), B) - AC_eta * (grad_div - lap);
+
+    return ind;
+}
+
+#if LENTROPY
+Scalar
+lnT(in ScalarField ss, in ScalarField lnrho)
+{
+    const Scalar lnT = AC_lnT0 + AC_gamma * value(ss) / AC_cp_sound +
+                       (AC_gamma - Scalar(1.)) * (value(lnrho) - AC_lnrho0);
+    return lnT;
+}
+
+// Nabla dot (K nabla T) / (rho T)
+Scalar
+heat_conduction(in ScalarField ss, in ScalarField lnrho)
+{
+    const Scalar inv_AC_cp_sound = AcReal(1.) / AC_cp_sound;
+
+    const Vector grad_ln_chi = -gradient(lnrho);
+
+    const Scalar first_term = AC_gamma * inv_AC_cp_sound * laplace(ss) +
+                              (AC_gamma - AcReal(1.)) * laplace(lnrho);
+    const Vector second_term = AC_gamma * inv_AC_cp_sound * gradient(ss) +
+                               (AC_gamma - AcReal(1.)) * gradient(lnrho);
+    const Vector third_term = AC_gamma * (inv_AC_cp_sound * gradient(ss) + gradient(lnrho)) +
+                              grad_ln_chi;
+
+    const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * AC_cp_sound);
+    return AC_cp_sound * chi * (first_term + dot(second_term, third_term));
+}
+
+Scalar
+heating(const int i, const int j, const int k)
+{
+    return 1;
+}
+
+Scalar
+entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorField aa)
+{
+    const Matrix S      = stress_tensor(uu);
+    const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
+    const Vector j      = (Scalar(1.) / AC_mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Scalar RHS = H_CONST - C_CONST + AC_eta * (AC_mu0)*dot(j, j) +
+                       Scalar(2.) * exp(value(lnrho)) * AC_nu_visc * contract(S) +
+                       AC_zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
+
+    return -dot(value(uu), gradient(ss)) + inv_pT * RHS + heat_conduction(ss, lnrho);
+}
+#endif
+
+#if LTEMPERATURE
+Scalar
+heat_transfer(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
+{
+    const Matrix S                  = stress_tensor(uu);
+    const Scalar heat_diffusivity_k = 0.0008; // 8e-4;
+    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) +
+           heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) +
+           AC_nu_visc * contract(S) * (Scalar(1.) / AC_cv_sound) -
+           (AC_gamma - 1) * value(tt) * divergence(uu);
+}
+#endif
+
+#if LFORCING
+Vector
+simple_vortex_forcing(Vector a, Vector b, Scalar magnitude)
+{
+    return magnitude * cross(normalized(b - a), (Vector){0, 0, 1}); // Vortex
+}
+
+Vector
+simple_outward_flow_forcing(Vector a, Vector b, Scalar magnitude)
+{
+    return magnitude * (1 / length(b - a)) * normalized(b - a); // Outward flow
+}
+
+// The Pencil Code forcing_hel_noshear(), manual Eq. 222, inspired forcing function with adjustable
+// helicity
+Vector
+helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
+{
+    // JP: This looks wrong:
+    //    1) Should it be AC_dsx * AC_nx instead of AC_dsx * AC_ny?
+    //    2) Should you also use globalGrid.n instead of the local n?
+    //    MV: You are rigth. Made a quickfix. I did not see the error  because multigpu is split
+    //        in z direction not y direction.
+    //    3) Also final point: can we do this with vectors/quaternions instead?
+    //       Tringonometric functions are much more expensive and inaccurate/
+    //    MV: Good idea. No an immediate priority.
+    // Fun related article:
+    // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
+    xx.x = xx.x * (2.0 * M_PI / (AC_dsx * globalGridN.x));
+    xx.y = xx.y * (2.0 * M_PI / (AC_dsy * globalGridN.y));
+    xx.z = xx.z * (2.0 * M_PI / (AC_dsz * globalGridN.z));
+
+    Scalar cos_phi     = cos(phi);
+    Scalar sin_phi     = sin(phi);
+    Scalar cos_k_dot_x = cos(dot(k_force, xx));
+    Scalar sin_k_dot_x = sin(dot(k_force, xx));
+    // Phase affect only the x-component
+    // Scalar real_comp       = cos_k_dot_x;
+    // Scalar imag_comp       = sin_k_dot_x;
+    Scalar real_comp_phase = cos_k_dot_x * cos_phi - sin_k_dot_x * sin_phi;
+    Scalar imag_comp_phase = cos_k_dot_x * sin_phi + sin_k_dot_x * cos_phi;
+
+    Vector force = (Vector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
+                            ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
+                            ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
+
+    return force;
+}
+
+Vector
+forcing_OLD(int3 globalVertexIdx, Scalar dt)
+{
+    Vector a         = Scalar(.5) * (Vector){globalGridN.x * AC_dsx, globalGridN.y * AC_dsy,
+                                     globalGridN.z * AC_dsz}; // source (origin)
+    Vector xx        = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
+                         (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
+                         (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz}; // sink (current index)
+    const Scalar cs2 = AC_cs2_sound;
+    const Scalar cs  = sqrt(cs2);
+
+    // Placeholders until determined properly
+    Scalar magnitude = AC_forcing_magnitude;
+    Scalar phase     = AC_forcing_phase;
+    Vector k_force   = (Vector){AC_k_forcex, AC_k_forcey, AC_k_forcez};
+    Vector ff_re     = (Vector){AC_ff_hel_rex, AC_ff_hel_rey, AC_ff_hel_rez};
+    Vector ff_im     = (Vector){AC_ff_hel_imx, AC_ff_hel_imy, AC_ff_hel_imz};
+
+    // Determine that forcing funtion type at this point.
+    // Vector force = simple_vortex_forcing(a, xx, magnitude);
+    // Vector force = simple_outward_flow_forcing(a, xx, magnitude);
+    Vector force = helical_forcing(magnitude, k_force, xx, ff_re, ff_im, phase);
+
+    // Scaling N = magnitude*cs*sqrt(k*cs/dt)  * dt
+    const Scalar NN = cs * sqrt(AC_kaver * cs);
+    // MV: Like in the Pencil Code. I don't understandf the logic here.
+    force.x = sqrt(dt) * NN * force.x;
+    force.y = sqrt(dt) * NN * force.y;
+    force.z = sqrt(dt) * NN * force.z;
+
+    if (is_valid(force)) {
+        return force;
+    }
+    else {
+        return (Vector){0, 0, 0};
+    }
+}
+
+// PC-style helical forcing with support for profiles
+Vector
+forcing(int3 vertexIdx, int3 globalVertexIdx, Scalar dt, ScalarArray profx, ScalarArray profy,
+        ScalarArray profz, ScalarArray profx_hel, ScalarArray profy_hel, ScalarArray profz_hel)
+{
+    Vector pos = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
+                          (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
+                          (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz};
+
+    Complex fx = AC_fact * exp(Complex(0, AC_kk.x * AC_k1_ff * pos.z + AC_phase));
+    Complex fy = exp(Complex(0, AC_kk.y * AC_k1_ff * pos.y));
+
+    Complex fz;
+    if (AC_iforcing_zsym == 0) {
+        fz = exp(Complex(0., AC_kk.z * AC_k1_ff * pos.z));
+    }
+    else if (AC_iforcing_zsym == 1) {
+        fz = Complex(cos(AC_kk.z * AC_k1_ff * pos.z), 0);
+    }
+    else if (AC_iforcing_zsym == -1) {
+        fz = Complex(sin(AC_kk.z * AC_k1_ff * pos.z), 0);
+    }
+    else {
+        // Failure
+    }
+
+    Complex fxyz = fx * fy * fz;
+
+    // TODO recheck indices
+    Scalar force_ampl    = profx[vertexIdx.x - NGHOST] * profy[vertexIdx.y] * profz[vertexIdx.z];
+    Scalar prof_hel_ampl = profx_hel[vertexIdx.x - NGHOST] * profy_hel[vertexIdx.y] *
+                           profz_hel[vertexIdx.z];
+
+    return force_ampl * AC_fda * (Complex(AC_coef1.x, prof_hel_ampl * AC_coef2.x) * fxyz).x;
+}
+#endif // LFORCING
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in ScalarField lnrho(VTXBUF_LNRHO);
+out ScalarField out_lnrho(VTXBUF_LNRHO);
+
+in VectorField uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+out VectorField out_uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+
+#if LMAGNETIC
+in VectorField aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
+out VectorField out_aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
+#endif
+
+#if LENTROPY
+in ScalarField ss(VTXBUF_ENTROPY);
+out ScalarField out_ss(VTXBUF_ENTROPY);
+#endif
+
+#if LTEMPERATURE
+in ScalarField tt(VTXBUF_TEMPERATURE);
+out ScalarField out_tt(VTXBUF_TEMPERATURE);
+#endif
+
+Kernel void
+solve()
+{
+    Scalar dt = AC_dt;
+    out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
+
+#if LMAGNETIC
+    out_aa = rk3(out_aa, aa, induction(uu, aa), dt);
+#endif
+
+#if LENTROPY
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt);
+    out_ss = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt);
+#elif LTEMPERATURE
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho, tt), dt);
+    out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt);
+#else
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt);
+#endif
+
+#if LFORCING
+    if (step_number == 2) {
+        out_uu = out_uu + forcing(vertexIdx, globalVertexIdx, dt, AC_profx, AC_profy, AC_profz,
+                                  AC_profx_hel, AC_profy_hel, AC_profz_hel);
+    }
+#endif
+}
--- a/acc/src/acc.l
+++ b/acc/src/acc.l
@@ -15,8 +15,10 @@ L [a-zA-Z_]
 "void"      { return VOID; } /* Rest of the types inherited from C */
 "int"       { return INT; }
 "int3"      { return INT3; }
+"Complex"   { return COMPLEX; }
 "ScalarField" { return SCALARFIELD; }
 "VectorField" { return VECTOR; }
+"ScalarArray" { return SCALARARRAY; }

 "Kernel"       { return KERNEL; } /* Function specifiers */
 "Preprocessed" { return PREPROCESSED; }
--- a/acc/src/acc.y
+++ b/acc/src/acc.y
@@ -16,8 +16,8 @@ int yyget_lineno();
 %token CONSTANT IN OUT UNIFORM
 %token IDENTIFIER NUMBER
 %token RETURN
-%token SCALAR VECTOR MATRIX SCALARFIELD
-%token VOID INT INT3
+%token SCALAR VECTOR MATRIX SCALARFIELD SCALARARRAY
+%token VOID INT INT3 COMPLEX
 %token IF ELSE FOR WHILE ELIF
 %token LEQU LAND LOR LLEQU
 %token KERNEL PREPROCESSED
@@ -210,6 +210,8 @@ type_specifier: VOID
              | VECTOR                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VECTOR;  }
              | MATRIX                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = MATRIX;  }
              | SCALARFIELD                                                             { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARFIELD;  }
+              | SCALARARRAY                                                             { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARARRAY;  }
+              | COMPLEX                                                                 { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = COMPLEX;  }
              ;

 identifier: IDENTIFIER                                                                  { $$ = astnode_create(NODE_IDENTIFIER, NULL, NULL); astnode_set_buffer(yytext, $$); }
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -61,6 +61,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
    [VECTOR]      = "AcReal3",
    [MATRIX]      = "AcMatrix",
    [SCALARFIELD] = "AcReal",
+    [SCALARARRAY] = "const AcReal* __restrict__",
+    [COMPLEX]     = "acComplex",
    // Type qualifiers
    [KERNEL] = "template <int step_number>  static __global__",
    //__launch_bounds__(RK_THREADBLOCK_SIZE,
@@ -380,20 +382,13 @@ traverse(const ASTNode* node)
        if (handle >= 0) { // The variable exists in the symbol table
            const Symbol* symbol = &symbol_table[handle];

-            // if (symbol->type_qualifier == OUT) {
-            //    printf("%s%s", inout_name_prefix, symbol->identifier);
-            //}
            if (symbol->type_qualifier == UNIFORM) {
-                printf("DCONST(%s) ", symbol->identifier);
-                /*
-                if (symbol->type_specifier == SCALAR)
-                    printf("DCONST_REAL(AC_%s) ", symbol->identifier);
-                else if (symbol->type_specifier == INT)
-                    printf("DCONST_INT(AC_%s) ", symbol->identifier);
-                else
-                    printf("INVALID UNIFORM type specifier %s with %s\n",
-                           translate(symbol->type_specifier), symbol->identifier);
-                           */
+                if (inside_kernel && symbol->type_specifier == SCALARARRAY) {
+                    printf("buffer.profiles[%s] ", symbol->identifier);
+                }
+                else {
+                    printf("DCONST(%s) ", symbol->identifier);
+                }
            }
            else {
                // Do a regular translation
@@ -613,6 +608,15 @@ generate_header(void)
    }
    printf("\n\n");

+    // Scalar arrays
+    printf("#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == SCALARARRAY) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
    /*
    printf("\n");
    printf("typedef struct {\n");
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -45,23 +45,394 @@ The foundational work was done in (Väisälä, Pekkilä, 2017) and the library,

 # Astaroth API

-## Devices and nodes
+The Astroth application-programming interface (API) provides the means for controlling execution of user-defined and built-in functions on multiple graphics processing units. Functions in the API are prefixed with lower case ```ac```, while structures and data types are prefixed with capitalized ```Ac```. Compile-time constants, such as definitions and enumerations, have the prefix ```AC_```. All of the API functions return an AcResult value indicating either success or failure. The return codes are
+```C
+typedef enum { 
+    AC_SUCCESS = 0, 
+    AC_FAILURE = 1
+} AcResult;
+```
+
+The API is divided into layers which differ in the level of control provided over the execution. There are two primary layers:
+
+* Device layer
+    > Functions start with acDevice*.
+    > Provides control over a single GPU.
+    > All functions are asynchronous.
+
+* Node layer
+    > Functions start with acNode*.
+    > Provides control over multiple devices in a single node.
+    > All functions are asynchronous and executed concurrently on all devices in the node.
+    > Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.
+
+Finally, a third layer is provided for convenience and backwards compatibility. 
+
+* Astaroth layer (deprecated)
+    > Functions start with ac* without Node or Device, f.ex. acInit().
+    > Provided for backwards compatibility.
+    > Essentially a wrapper for the Node layer. 
+    > All functions are guaranteed to be synchronous.
+
+There are also several helper functions defined in `include/astaroth_defines.h`, which can be used for, say, determining the size or performing index calculations within the simulation domain.
+
+
+## List of Astaroth API functions
+
+Here's a non-exhaustive list of astaroth API functions. For more info and an up-to-date list, see the corresponding header files in `include/astaroth_defines.h`, `include/astaroth.h`, `include/astaroth_node.h`, `include/astaroth_device.h`.
+
+### Initialization, quitting and helper functions
+
+Device layer.
+```C
+AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device);
+AcResult acDeviceDestroy(Device device);
+AcResult acDevicePrintInfo(const Device device);
+AcResult acDeviceAutoOptimize(const Device device);
+```
+
+Node layer.
+```C
+AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
+AcResult acNodeDestroy(Node node);
+AcResult acNodePrintInfo(const Node node);
+AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
+AcResult acNodeAutoOptimize(const Node node);
+```
+
+General helper functions.
+```C
+size_t acVertexBufferSize(const AcMeshInfo info);
+size_t acVertexBufferSizeBytes(const AcMeshInfo info);
+size_t acVertexBufferCompdomainSize(const AcMeshInfo info);
+size_t acVertexBufferCompdomainSizeBytes(const AcMeshInfo info);
+size_t acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo info);
+```
+
+### Loading and storing
+
+Loading meshes and vertex buffers to device memory.
+```C
+AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh);
+AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream,
+                                    const AcMesh host_mesh, const int3 src, const int3 dst,
+                                    const int num_vertices);
+AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
+                                  const VertexBufferHandle vtxbuf_handle);
+AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
+                                            const AcMesh host_mesh,
+                                            const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                            const int3 dst, const int num_vertices);
+
+AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh);
+AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
+                                  const int3 src, const int3 dst, const int num_vertices);
+AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
+                                const VertexBufferHandle vtxbuf_handle);
+AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
+                                          const AcMesh host_mesh,
+                                          const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                          const int3 dst, const int num_vertices);
+```
+
+Storing meshes and vertex buffer to host memory.
+```C
+AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
+AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
+                                     const int3 dst, const int num_vertices, AcMesh* host_mesh);
+AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
+                                   const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
+AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
+                                     const int3 dst, const int num_vertices, AcMesh* host_mesh);
+
+AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh);
+AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src,
+                                   const int3 dst, const int num_vertices, AcMesh* host_mesh);
+AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream,
+                                 const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
+AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
+                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                           const int3 dst, const int num_vertices,
+                                           AcMesh* host_mesh);
+```
+
+Transferring data between devices
+```C
+AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
+AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
+                                        const int3 src, const int3 dst, const int num_vertices,
+                                        Device* dst_device);
+AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
+                                      const VertexBufferHandle vtxbuf_handle, Device dst_device);
+AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
+                                                const VertexBufferHandle vtxbuf_handle,
+                                                const int3 src, const int3 dst,
+                                                const int num_vertices, Device dst_device);
+```
+
+Loading uniforms (device constants)
+```C
+AcResult acDeviceLoadScalarConstant(const Device device, const Stream stream,
+                                    const AcRealParam param, const AcReal value);
+AcResult acDeviceLoadVectorConstant(const Device device, const Stream stream,
+                                    const AcReal3Param param, const AcReal3 value);
+AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const AcIntParam param,
+                                 const int value);
+AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
+                                  const int3 value);
+AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
+                                 const ScalarArrayHandle handle, const AcReal* data,
+                                 const size_t num);
+AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
+                              const AcMeshInfo device_config);
+```
+
+
+### Computation
+
+```C
+AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                                  const int3 start, const int3 end, const AcReal dt);
+AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle, const int3 start,
+                                       const int3 end);
+AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                                    const int3 end);
+AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                            const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
+                           const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                           const VertexBufferHandle vtxbuf2, AcReal* result);
+
+AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
+                                const int3 start, const int3 end, const AcReal dt);
+AcResult acNodeIntegrate(const Node node, const AcReal dt);
+AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
+                                     const VertexBufferHandle vtxbuf_handle);
+AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
+AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
+                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                         const VertexBufferHandle vtxbuf2, AcReal* result);
+```
+
+### Stream synchronization
+
+All library functions that take a `Stream` as a parameter are asynchronous. When calling these functions,
+the control returns immediately back to the host even if the called device function has not yet completed.
+Therefore special care must be taken in order to ensure proper synchronization.
+
+Synchronization is done using `Stream` primitives, defined as
+```C
+typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
+#define STREAM_ALL (NUM_STREAMS)
+```
+
+Functions queued in the same stream will be executed sequentially. If two or more consequent
+functions are queued in different streams, then these functions may execute in parallel. Finally,
+there is a barrier synchronization function, which blocks until all functions in some stream have
+completed. The Astaroth API provides barrier synchronization with functions `acDeviceSynchronize` and
+`acNodeSynchronize`. All streams can be synchronized at once by passing the alias `STREAM_ALL` to
+the synchronization function.
+
+Usage of streams is demonstrated with the following example.
+```C
+funcA(STREAM_0);
+funcB(STREAM_0); // Blocks until funcA has completed
+funcC(STREAM_1); // May execute in parallel with funcB
+synchronizeStream(STREAM_ALL); // Blocks until functions in all streams have completed
+funcD(STREAM_2); // Is started when command returns from synchronizeStream()
+```
+
+### Data synchronization
+
+Stream synchronization works in the same fashion on node and device layers. However, on the node
+layer one has to take in account that a portion of the mesh is shared between devices and that the
+data is always up to date.
+
+In stencil computations, the mesh is surrounded by a halo, where data is only used for updating grid
+points near the boundaries of the simulation domain. A portion of this halo is shared by neighboring
+devices. As there is no way of knowing when the user has completed operations on the mesh, the data
+communication between neighboring devices must be explicitly triggered. For this purpose, we provide
+the functions
+```C
+AcResult acNodeSynchronizeMesh(const Node node, const Stream stream);
+AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle);
+
+```
+
+> **NOTE**: Local halos must be up to date before synchronizing the data. Local halos are the grid points outside the computational domain which are used only by a single device. The mesh is distributed to multiple devices by blocking along the z axis. If there are *n* devices and the z-dimension of the computational domain is *nz*, then each device is assigned *nz / n* two-dimensional planes. For example with two devices, the data block that has to be up to date ranges from *(0, 0, nz)* to *(mx, my, nz + 2 * NGHOST)* 
+
+### Input and output buffers
+
+The mesh is duplicated to input and output buffers for performance reasons. The input buffers are
+read-only in user-specified compute kernels, which allows us to read them via the texture cache optimized
+for spatially local memory accesses. The results of compute kernels are written into the output buffers.
+
+Since we allow the user to operate on subsets of the computational domain in user-specified kernels, we have no way to know when the output buffers are complete and can be swapped. Therefore the user should explicitly state when the input and output buffer should be swapped. This is done via the API calls
+```C
+AcResult acDeviceSwapBuffers(const Device device);
+AcResult acNodeSwapBuffers(const Node node);
+```
+> **NOTE**: All functions provided with the API operate on input buffers and ensure that the complete result is available in the input buffer when the function has completed. User-specified kernels are exceptions and write the result to output buffers. Therefore buffers have to be swapped only after calling user-specified kernels.
+
+## Devices
+
+`Device` is a handle to some single device and is used in device layer functions to specify which device should execute the function. A `Device` is created and destroyed with the following interface functions.
+```C
+AcResult acDeviceCreate(const int device_id, const AcMeshInfo device_config, Device* device);
+AcResult acDeviceDestroy(Device device);
+```
+
+## Nodes
+
+`Node` is a handle to some compute node which consists of multiple devices. The `Node` handle is used to specify which node the node layer functions should operate in. A node is created and destroyed with the following interface functions.
+```C
+AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
+AcResult acNodeDestroy(Node node);
+```
+
+The function acNodeCreate calls acDeviceCreate for all devices that are visible from the current process. After a node has been created, the devices in it can be retrived with the function
+```C
+AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
+```
+where DeviceConfiguration is defined as
+```C
+typedef struct {
+    int num_devices;
+    Device* devices; // Todo make this predefined s.t. the user/us do not have to alloc/free
+
+    Grid grid;
+    Grid subgrid;
+} DeviceConfiguration;
+```

 ## Meshes

-## Streams and synchronization
+Meshes are the primary structures for passing information to the library and kernels. The definition
+of a `Mesh` is declared as
+```C
+typedef struct {
+    int int_params[NUM_INT_PARAMS];
+    int3 int3_params[NUM_INT3_PARAMS];
+    AcReal real_params[NUM_REAL_PARAMS];
+    AcReal3 real3_params[NUM_REAL3_PARAMS];
+} AcMeshInfo;

-## Interface
+typedef struct {
+    AcReal* vertex_buffer[NUM_VTXBUF_HANDLES];
+    AcMeshInfo info;
+} AcMesh;
+```

+Several built-in parameters, such as the dimensions of the mesh, and all user-defined parameters
+are defined in the `AcMeshInfo` structure. Before passing AcMeshInfo to an initialization function,
+such as `acDeviceCreate()`, the built-in parameters `AC_nx, AC_ny, AC_nz` must be set. These
+parameters define the dimensions of the computational domain of the mesh. For example,
+```C
+AcMeshInfo info;
+info.int_params[AC_nx] = 128;
+info.int_params[AC_ny] = 64;
+info.int_params[AC_nz] = 32;
+
+Device device;
+acDeviceCreate(0, info, &device);
+```
+
+AcMesh is used in loading and storing data from host to device and vice versa. Before calling for
+example `acDeviceLoadMesh()`, one must ensure that all `NUM_VTXBUF_HANDLES` are pointers to valid
+arrays in host memory consisting of `mx * my * mz` elements and stored in order
+`i + j * mx + k * mx * my`, where `i`, `j` and `k` correspond to `x`, `y` and `z` coordinate
+indices, respectively. The mesh dimensions can be queried with 
+```C
+int mx = info.int_params[AC_mx];
+int my = info.int_params[AC_my];
+int mz = info.int_params[AC_mz];
+```
+after initialization.
+
+
+### Decomposition
+Grids and subgrids contain the dimensions of the the mesh decomposed to multiple devices.
+```C
+typedef struct {
+    int3 m; // Size of the simulation domain (includes the ghost zones)
+    int3 n; // Size of the computational domain (without ghost zones)
+} Grid;
+```
+
+As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple devices
+by blocking the data along the *z*-axis. Given the mesh dimensions *(mx, my, mz)*, its computational
+domain *(nx, ny, nz)* and *n* number of devices, then each device is assigned a mesh of size
+*(mx, my, 2 * NGHOST + nz/n)* and a computational domain of size *(nx, ny, nz/n)*.
+
+Let *i* be the device id. The portion of the halos shared by neighboring devices is then *(0, 0, i * nz/n)* - *(mx, my, 2 * NGHOST + i * nz/n)*. The functions `acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among the devices in the node.
+
+## Integration, reductions and boundary conditions
+
+The library provides the following functions for integration, reductions and computing periodic boundary conditions.
+```C
+AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                                  const int3 start, const int3 end, const AcReal dt);
+AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle, const int3 start,
+                                       const int3 end);
+AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                                    const int3 end);
+AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                            const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
+                           const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                           const VertexBufferHandle vtxbuf2, AcReal* result);
+
+AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
+                                const int3 start, const int3 end, const AcReal dt);
+AcResult acNodeIntegrate(const Node node, const AcReal dt);
+AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
+                                     const VertexBufferHandle vtxbuf_handle);
+AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
+AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
+                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                         const VertexBufferHandle vtxbuf2, AcReal* result);
+```
+
+Finally, there's a library function that is automatically generated for all user-specified `Kernel`
+functions written with the Astaroth DSL,
+```C
+AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,
+                                     const int3 start, const int3 end);
+```
+Where `##identifier` is replaced with the name of the user-specified kernel. For example, a device
+function `Kernel solve()` can be called with `acDeviceKernel_solve()` via the API.

 # Astaroth DSL

 ## Uniforms

+### Control flow and implementing switches
+// Runtime constants are as fast as compile-time constants as long as
+// 1) They are not placed in tight loops, especially those that inlcude global memory accesses, that could be unrolled
+// 2) They are not multiplied with each other
+// 3) At least 32 neighboring threads in the x-axis access the same constant
+
+// Safe and efficient to use as switches
+
 ## Vertex buffers

 ### Input and output buffers

 ## Built-in variables and functions

+## Functions
+
+### Kernel
+
+### Preprocessed
+
+
+

--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -127,9 +127,9 @@ typedef enum {
    STREAM_14,
    STREAM_15,
    STREAM_16,
-    NUM_STREAM_TYPES
+    NUM_STREAMS
 } Stream;
-#define STREAM_ALL (NUM_STREAM_TYPES)
+#define STREAM_ALL (NUM_STREAMS)

 #define AC_GEN_ID(X) X
 typedef enum {
@@ -156,6 +156,11 @@ typedef enum {
    NUM_REAL3_PARAMS
 } AcReal3Param;

+typedef enum {
+    AC_FOR_SCALARARRAY_HANDLES(AC_GEN_ID) //
+    NUM_SCALARARRAY_HANDLES
+} ScalarArrayHandle;
+
 typedef enum {
    AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) //
    NUM_VTXBUF_HANDLES
@@ -166,6 +171,7 @@ extern const char* intparam_names[];
 extern const char* int3param_names[];
 extern const char* realparam_names[];
 extern const char* real3param_names[];
+extern const char* scalararray_names[];
 extern const char* vtxbuf_names[];

 typedef struct {
--- a/include/astaroth_device.h
+++ b/include/astaroth_device.h
@@ -61,6 +61,11 @@ AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const
 AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
                                  const int3 value);

+/** */
+AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
+                                 const ScalarArrayHandle handle, const size_t start,
+                                 const AcReal* data, const size_t num);
+
 /** */
 AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
                              const AcMeshInfo device_config);
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -5,6 +5,7 @@ then
       echo "ASTAROTH_HOME environment variable not set, run \"source ./sourceme.sh\" in Astaroth home directory"
       exit 1
 fi
+OUTPUT_DIR=${PWD}

 KERNEL_DIR=${AC_HOME}"/src/core/kernels"
 ACC_DIR=${AC_HOME}"/acc"
@@ -25,11 +26,19 @@ while [ "$#" -gt 0 ]
 do
 	case $1 in
 		-h|--help)
-			echo "You can set a custom files for DSL under the path $AC_HOME/"
+			echo "This script will help to compile DSL to CUDA code."
+			echo "The resulting kernels will be stored to $OUTPUT_DIR."
+			echo "You can set a custom files for DSL under the path $AC_DIR/"
 			echo "Example:"
 			echo "compile_acc.sh -a custom_setup/custom_assembly.sas -p custom_setup/custom_process.sps --header custom_setup/custom_header.h"
 			exit 0
 			;;
+		-I|--include)
+			shift
+                        ACC_INCLUDE_DIR=${1}
+			shift
+                        echo "CUSTOM include dir!"
+			;;
 		--header)
 			shift
                        ACC_HEADER=${1}
@@ -63,11 +72,11 @@ ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_HEADER}

-echo "Moving stencil_assembly.cuh -> ${AC_HOME}/src/core/kernels"
-mv stencil_assembly.cuh ${AC_HOME}/src/core/kernels
+echo "Moving stencil_assembly.cuh -> ${OUTPUT_DIR}"
+mv stencil_assembly.cuh ${OUTPUT_DIR}

-echo "Moving stencil_process.cuh -> ${AC_HOME}/src/core/kernels"
-mv stencil_process.cuh ${AC_HOME}/src/core/kernels
+echo "Moving stencil_process.cuh -> ${OUTPUT_DIR}"
+mv stencil_process.cuh ${OUTPUT_DIR}

-echo "Moving stencil_defines.cuh -> ${AC_HOME}/include"
-mv stencil_defines.h ${AC_HOME}/include
+echo "Moving stencil_defines.cuh -> ${OUTPUT_DIR}"
+mv stencil_defines.h ${OUTPUT_DIR}
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -22,15 +22,16 @@
 #include "math_utils.h" // int3 + int3

 #define AC_GEN_STR(X) #X
-const char* intparam_names[]   = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
+const char* intparam_names[]    = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
                                AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* int3param_names[]  = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
+const char* int3param_names[]   = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
                                 AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[]  = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
+const char* realparam_names[]   = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
                                 AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
+const char* real3param_names[]  = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
                                  AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
-const char* vtxbuf_names[]     = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
+const char* vtxbuf_names[]      = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
 #undef AC_GEN_STR

 static const int num_nodes = 1;
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -37,6 +37,8 @@
 typedef struct {
    AcReal* in[NUM_VTXBUF_HANDLES];
    AcReal* out[NUM_VTXBUF_HANDLES];
+
+    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
 } VertexBufferArray;

 struct device_s {
@@ -44,7 +46,7 @@ struct device_s {
    AcMeshInfo local_config;

    // Concurrency
-    cudaStream_t streams[NUM_STREAM_TYPES];
+    cudaStream_t streams[NUM_STREAMS];

    // Memory
    VertexBufferArray vba;
@@ -97,6 +99,32 @@ DCONST(const VertexBufferHandle handle)
 //#define globalMeshN_min // Placeholder
 #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
 //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
+//#include <thrust/complex.h>
+// using namespace thrust;
+#include <cuComplex.h>
+#if AC_DOUBLE_PRECISION == 1
+typedef cuDoubleComplex acComplex;
+#define acComplex(x, y) make_cuDoubleComplex(x, y)
+#else
+typedef cuFloatComplex acComplex;
+#define acComplex(x, y) make_cuFloatComplex(x, y)
+#endif
+static __device__ inline acComplex
+exp(const acComplex& val)
+{
+    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
+}
+static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+{
+    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
+}
+//#include <complex>
+
 #include "kernels/boundconds.cuh"
 #include "kernels/integration.cuh"
 #include "kernels/reductions.cuh"
@@ -135,16 +163,26 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
    printf("Success!\n");

    // Concurrency
-    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
    }

    // Memory
+    // VBA in/out
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
+    // VBA Profiles
+    const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
+                                                           max(device_config.int_params[AC_my],
+                                                               device_config.int_params[AC_mz]));
+    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
+    }
+
+    // Reductions
    ERRCHK_CUDA_ALWAYS(
        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
@@ -178,6 +216,10 @@ acDeviceDestroy(Device device)
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
+    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
+        cudaFree(device->vba.profiles[i]);
+    }
+
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

@@ -186,7 +228,7 @@ acDeviceDestroy(Device device)
 #endif

    // Concurrency
-    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStreamDestroy(device->streams[i]);
    }

@@ -405,6 +447,21 @@ acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3P
    return AC_SUCCESS;
 }

+AcResult
+acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
+                        const size_t start, const AcReal* data, const size_t num)
+{
+    cudaSetDevice(device->id);
+
+    ERRCHK(start + num <= max(device->local_config.int_params[AC_mx],
+                              max(device->local_config.int_params[AC_my],
+                                  device->local_config.int_params[AC_mz])));
+
+    ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
+                                cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
 {
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -70,11 +70,11 @@ create_rotz(const AcReal radians)
 #define cos __cosf
 #define exp __expf
 */
-#define sin sinf
-#define cos cosf
-#define exp expf
-#define rsqrt rsqrtf // hardware reciprocal sqrt
-#endif               // AC_DOUBLE_PRECISION == 0
+//#define sin sinf
+//#define cos cosf
+//#define exp expf
+//#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0

 /*
 * =============================================================================
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -124,6 +124,11 @@ static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
    return (AcReal3){a * b.x, a * b.y, a * b.z};
 }

+static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
+{
+    return (AcReal3){a * b.x, a * b.y, a * b.z};
+}
+
 static HOST_DEVICE_INLINE AcReal
 dot(const AcReal3& a, const AcReal3& b)
 {