Merge branch 'master' into alt_bcond_2020_09

2020-11-23 15:47:46 +08:00
parent 543c565e5d e3eb782213
commit 11eddabbd6
14 changed files with 238 additions and 65 deletions
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -107,7 +107,7 @@ main(int argc, char** argv)
        }
    }

-    const TestType test = TEST_STRONG_SCALING;
+    const TestType test = TEST_WEAK_SCALING;
    if (test == TEST_WEAK_SCALING) {
        uint3_64 decomp = decompose(nprocs);
        info.int_params[AC_nx] *= decomp.x;
@@ -126,10 +126,15 @@ main(int argc, char** argv)

    // GPU alloc & compute
    acGridInit(info);
+    acGridRandomize();
+
+    /*
    AcMesh model;
    acMeshCreate(info, &model);
    acMeshRandomize(&model);
    acGridLoadMesh(STREAM_DEFAULT, model);
+    */
+
    /*
    acGridLoadMesh(STREAM_DEFAULT, model);

@@ -154,7 +159,7 @@ main(int argc, char** argv)
    }*/

    // Percentiles
-    const size_t num_iters      = 1000;
+    const size_t num_iters      = 100;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    results.reserve(num_iters);
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -17,42 +17,48 @@ main(void)

        // Boilerplate
        fprintf(fp, "#!/bin/bash\n");
-        fprintf(fp, "#BATCH --job-name=astaroth\n");
-        fprintf(fp, "#SBATCH --account=project_2000403\n");
-        fprintf(fp, "#SBATCH --time=03:00:00\n");
-        fprintf(fp, "#SBATCH --mem=32000\n");
-        fprintf(fp, "#SBATCH --partition=gpu\n");
+        fprintf(fp, "#SBATCH --job-name=astaroth\n");       // OK
+        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
+        fprintf(fp, "#SBATCH --time=04:00:00\n");           // OK
+        fprintf(fp, "#SBATCH --mem=0\n");                   // OK
+        fprintf(fp, "#SBATCH --partition=gpu\n");           // OK
+        fprintf(fp, "#SBATCH --exclusive\n");               // OK
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");        // OK
        fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
+        // HACK: exclude misconfigured nodes on Puhti
+        fprintf(fp, "#SBATCH -x "
+                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
        // fprintf(fp, "#SBATCH --cpus-per-task=10\n");

        // nprocs, nodes, gpus
        const int max_gpus_per_node = 4;
        const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
        const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
-        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
-        fprintf(fp, "#SBATCH -n %d\n", nprocs);
-        fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
+        fprintf(fp, "#SBATCH -n %d\n", nprocs);                     // OK
+        fprintf(fp, "#SBATCH -N %d\n", nodes);                      // OK
        // fprintf(fp, "#SBATCH --exclusive\n");
-        if (nprocs >= 4)
-            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
+        // if (nprocs >= 4)
+        //    fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

        // Modules
        // OpenMPI
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
-        //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
-        //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
-        //if (nprocs >= 32)
-        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
+        // Reference for the UCX_TLS settings below: https://www.open-mpi.org/fa
+        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n");
+        // fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
+        // if (nprocs >= 32)
+        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n");

        // HPCX
-        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
-        //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
+        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0

        // Profile and run
        // fprintf(fp, "mkdir -p profile_%d\n", nprocs);

        /*
-        const int nx = 256; // max size 1792;
+        const int nx = 256; // max size 2048;
        const int ny = nx;
        const int nz = nx;

@@ -67,11 +73,11 @@ main(void)
            "benchmark_decomp_1D",       "benchmark_decomp_2D",      "benchmark_decomp_3D",
            "benchmark_decomp_1D_comm",  "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
            "benchmark_meshsize_256",    "benchmark_meshsize_512",   "benchmark_meshsize_1024",
-            "benchmark_meshsize_1792",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
+            "benchmark_meshsize_2048",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
            "benchmark_stencilord_6",    "benchmark_stencilord_8",   "benchmark_timings_control",
            "benchmark_timings_comp",    "benchmark_timings_comm",   "benchmark_timings_default",
            "benchmark_timings_corners", "benchmark_weak_128",       "benchmark_weak_256",
-            "benchmark_weak_448",
+            "benchmark_weak_512",
        };
        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
            int nn = 256;
@@ -79,14 +85,32 @@ main(void)
                nn = 512;
            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
                nn = 1024;
-            else if (strcmp(files[i], "benchmark_meshsize_1792") == 0)
-                nn = 1792;
+            else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
+                nn = 2048;
            else if (strcmp(files[i], "benchmark_weak_128") == 0)
                nn = 128;
-            else if (strcmp(files[i], "benchmark_weak_448") == 0)
-                nn = 448;
+            else if (strcmp(files[i], "benchmark_weak_512") == 0)
+                nn = 512;

-            fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn);
+            // With Fredrik's tunings
+            // (may cause "Assertion `status == UCS_OK' failed" errors)
+            // fprintf(fp,
+            //        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+            //        "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
+            //        files[i], nn, nn, nn);
+            if (nodes >= 2) {
+                fprintf(fp,
+                        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+                        "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
+                        "-f core.* && cd ..)\n",
+                        files[i], nn, nn, nn);
+            }
+            else {
+                fprintf(fp,
+                        "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
+                        "&& cd ..)\n",
+                        files[i], nn, nn, nn);
+            }
        }

        fclose(fp);
--- a/samples/standalone/simulation.cc
+++ b/samples/standalone/simulation.cc
@@ -43,7 +43,13 @@

 // NEED TO BE DEFINED HERE. IS NOT NOTICED BY compile_acc call.
 #define LFORCING (0)
+
+#ifdef VTXBUF_ACCRETION
+#define LSINK (1)
+#else
 #define LSINK (0)
+#endif
+
 #ifdef BFIELDX
 #define LBFIELD (1)
 #else
@@ -322,6 +328,7 @@ run_simulation(const char* config_path)
    // acmesh_init_to(INIT_TYPE_SIMPLE_CORE, mesh); //Initial condition for a collapse test

 #if LSINK
+    printf("WARNING! Sink particle is under development. USE AT YOUR OWN RISK!");
    vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
 #endif

@@ -388,18 +395,10 @@ run_simulation(const char* config_path)
    /* Step the simulation */
    AcReal accreted_mass = 0.0;
    AcReal sink_mass     = 0.0;
+    AcReal uu_freefall = 0.0;
    AcReal dt_typical    = 0.0;
    int dtcounter = 0;
    for (int i = start_step + 1; i < max_steps; ++i) {
-        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-#if LBFIELD
-        const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO);
-        const AcReal uref  = max(umax, vAmax); 
-        const AcReal dt   = host_timestep(uref, vAmax, mesh_info);
-#else
-        const AcReal dt   = host_timestep(umax, 0.0l, mesh_info);
-#endif
-
 #if LSINK

        const AcReal sum_mass = acReduceScal(RTYPE_SUM, VTXBUF_ACCRETION);
@@ -407,7 +406,7 @@ run_simulation(const char* config_path)
        sink_mass             = 0.0;
        sink_mass             = mesh_info.real_params[AC_M_sink_init] + accreted_mass;
        acLoadDeviceConstant(AC_M_sink, sink_mass);
-        vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
+        vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); // TODO: BUG! This only sets the host buffer to 0.

        int on_off_switch;
        if (i < 1) {
@@ -417,11 +416,26 @@ run_simulation(const char* config_path)
            on_off_switch = 1;
        }
        acLoadDeviceConstant(AC_switch_accretion, on_off_switch);
+
+        //Adjust courant condition for free fall velocity
+        const AcReal RR = mesh_info.real_params[AC_soft]*mesh_info.real_params[AC_soft];
+        const AcReal SQ2GM = sqrt(AcReal(2.0)*mesh_info.real_params[AC_G_const]*sink_mass);
+        uu_freefall = fabs(SQ2GM / sqrt(RR));
 #else
        accreted_mass = -1.0;
        sink_mass     = -1.0;
 #endif

+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+#if LBFIELD
+        const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO);
+        const AcReal uref  = max(max(umax,uu_freefall), vAmax); 
+        const AcReal dt   = host_timestep(uref, vAmax, mesh_info);
+#else
+        const AcReal uref  = max(umax,uu_freefall); 
+        const AcReal dt   = host_timestep(uref, 0.0l, mesh_info);
+#endif
+
 #if LFORCING
        const ForcingParams forcing_params = generateForcingParams(mesh_info);
        loadForcingParamsToDevice(forcing_params);