diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..83f93b54b75bc9c0448eec2acc5bc1ac97f3e56f --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps +*.out + +build +share/**/*.silo +.vscode diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f7df4acd36e3f181435d255232d5886b7bdde42..49827e51bfaeda58908f547245aba8634cfd5175 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -281,6 +281,12 @@ option(OPT_EXPENSIVE_ASSERTS "Enable expensive assert()s" OFF) if (OPT_EXPENSIVE_ASSERTS) add_definitions(-DEXPENSIVE_ASSERTS) endif() + +option(NEW_CURLS "Use the new method for computing curls" OFF) +if (NEW_CURLS) + add_compile_definitions(NEW_CURLS) +endif() + ###################################################################### ## Enable/disable solver modules option(OPT_DEBUG_PRINT "Print debug information") @@ -317,6 +323,9 @@ if (OPT_BUILD_TESTS) add_executable(test_mass tests/test_mass.cpp) target_link_libraries(test_mass gmshdg ${LINK_LIBS}) + add_executable(test_curl tests/test_curl.cpp) + target_link_libraries(test_curl gmshdg ${LINK_LIBS}) + add_executable(test_differentiation tests/test_differentiation.cpp) target_link_libraries(test_differentiation gmshdg ${LINK_LIBS}) @@ -324,12 +333,18 @@ if (OPT_BUILD_TESTS) target_link_libraries(test_lifting gmshdg ${LINK_LIBS}) if (OPT_ENABLE_GPU_SOLVER) + add_executable(test_curl_gpu tests/test_curl_gpu.cpp) + target_link_libraries(test_curl_gpu gmshdg ${LINK_LIBS}) + add_executable(test_differentiation_gpu tests/test_differentiation_gpu.cpp) target_link_libraries(test_differentiation_gpu gmshdg ${LINK_LIBS}) add_executable(profile_differentiation_gpu tests/profile_differentiation_gpu.cpp) target_link_libraries(profile_differentiation_gpu gmshdg ${LINK_LIBS}) + add_executable(profile_curl_gpu tests/profile_curl_gpu.cpp) + target_link_libraries(profile_curl_gpu gmshdg ${LINK_LIBS}) + add_executable(test_lifting_gpu tests/test_lifting_gpu.cpp) target_link_libraries(test_lifting_gpu gmshdg ${LINK_LIBS}) endif() diff --git a/include/libgmshdg/kernels_cpu.h b/include/libgmshdg/kernels_cpu.h index fe14d5572715e7643a0a137c8b6e78557c88db2f..813418ce1dbdcbe80a70fd045d9f99abb898d516 100644 --- a/include/libgmshdg/kernels_cpu.h +++ b/include/libgmshdg/kernels_cpu.h @@ -19,6 +19,450 @@ struct kernel_cpu_sizes static const size_t num_all_fluxes = num_fluxes * num_faces; }; +template<size_t AK> +void compute_curl_kernel_planar(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, + vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz) +{ + /* Flop count: (45*ed.num_bf + 6)*ed.num_bf*num_total_elems */ + /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ + + assert(ed.g_order == 1); + + using KS = kernel_cpu_sizes<AK>; + assert(ed.num_bf == KS::num_bf); + + size_t orient_elem_base = 0; + for (size_t iO = 0; iO < ed.num_orientations; iO++) + { + size_t num_elems = ed.num_elems[iO]; + size_t orient_base = orient_elem_base * ed.num_bf; + + #pragma omp parallel for //num_threads(2) + for (size_t iT = 0; iT < num_elems; iT++) + { + size_t ent_iT = orient_elem_base + iT; + size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf; + + for (size_t odof = 0; odof < KS::num_bf; odof++) + { + double acc_dfx_dy = 0.0; + double acc_dfx_dz = 0.0; + + double acc_dfy_dx = 0.0; + double acc_dfy_dz = 0.0; + + double acc_dfz_dx = 0.0; + double acc_dfz_dy = 0.0; + + for (size_t idof = 0; idof < KS::num_bf; idof++) + { + auto dm_row = iO*KS::num_bf + odof; + auto dm_col0 = 0*KS::num_bf + idof; + auto dm_col1 = 1*KS::num_bf + idof; + auto dm_col2 = 2*KS::num_bf + idof; + + auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0); + auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1); + auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2); + + auto jbase = 3*ent_iT; + + double Fx = fx(dofofs+idof); + double v0_fx = dm_0 * Fx; + double v1_fx = dm_1 * Fx; + double v2_fx = dm_2 * Fx; + acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx + + ed.jacobians(jbase + 1, 1) * v1_fx + + ed.jacobians(jbase + 1, 2) * v2_fx); + acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx + + ed.jacobians(jbase + 2, 1) * v1_fx + + ed.jacobians(jbase + 2, 2) * v2_fx); + + double Fy = fy(dofofs+idof); + double v0_fy = dm_0 * Fy; + double v1_fy = dm_1 * Fy; + double v2_fy = dm_2 * Fy; + acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy + + ed.jacobians(jbase + 0, 1) * v1_fy + + ed.jacobians(jbase + 0, 2) * v2_fy); + acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy + + ed.jacobians(jbase + 2, 1) * v1_fy + + ed.jacobians(jbase + 2, 2) * v2_fy); + + double Fz = fz(dofofs+idof); + double v0_fz = dm_0 * Fz; + double v1_fz = dm_1 * Fz; + double v2_fz = dm_2 * Fz; + acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz + + ed.jacobians(jbase + 0, 1) * v1_fz + + ed.jacobians(jbase + 0, 2) * v2_fz); + acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz + + ed.jacobians(jbase + 1, 1) * v1_fz + + ed.jacobians(jbase + 1, 2) * v2_fz); + + } + + curl_fx(dofofs+odof) = alpha * (acc_dfz_dy - acc_dfy_dz); + curl_fy(dofofs+odof) = alpha * (acc_dfx_dz - acc_dfz_dx); + curl_fz(dofofs+odof) = alpha * (acc_dfy_dx - acc_dfx_dy); + + } + } + + orient_elem_base += num_elems; + } +} + +template<size_t AK> +void compute_curl_kernel_curved(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, + vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz) +{ + /* Flop count: ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + 6*((2*ed.num_bf-1) + 1)*ed.num_bf)*num_total_elems */ + /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ + + assert(ed.g_order == 1); + + using KS = kernel_cpu_sizes<AK>; + assert(ed.num_bf == KS::num_bf); + + size_t orient_elem_base = 0; + for (size_t iO = 0; iO < ed.num_orientations; iO++) + { + size_t num_elems = ed.num_elems[iO]; + size_t orient_base = orient_elem_base * ed.num_bf; + + #pragma omp parallel for //num_threads(2) + for (size_t iT = 0; iT < num_elems; iT++) + { + size_t ent_iT = orient_elem_base + iT; + size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf; + + vecxd dfx_dy = vecxd::Zero(ed.num_bf); + vecxd dfx_dz = vecxd::Zero(ed.num_bf); + + vecxd dfy_dx = vecxd::Zero(ed.num_bf); + vecxd dfy_dz = vecxd::Zero(ed.num_bf); + + vecxd dfz_dx = vecxd::Zero(ed.num_bf); + vecxd dfz_dy = vecxd::Zero(ed.num_bf); + + for (size_t iQp = 0; iQp < ed.num_qp; iQp++) + { + auto det = ed.cell_determinants(ent_iT * ed.num_qp + iQp); + + for (size_t odof = 0; odof < KS::num_bf; odof++) + { + double acc_dfx_dy = 0.0; + double acc_dfx_dz = 0.0; + + double acc_dfy_dx = 0.0; + double acc_dfy_dz = 0.0; + + double acc_dfz_dx = 0.0; + double acc_dfz_dy = 0.0; + + for (size_t idof = 0; idof < KS::num_bf; idof++) + { + auto dm_row = iO * KS::num_bf * ed.num_qp + KS::num_bf * iQp + odof; + auto dm_col0 = 0 * KS::num_bf + idof; + auto dm_col1 = 1 * KS::num_bf + idof; + auto dm_col2 = 2 * KS::num_bf + idof; + + auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0); + auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1); + auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2); + + auto jbase = 3 * ent_iT * ed.num_qp + 3 * iQp; + + double Fx = fx(dofofs+idof); + double v0_fx = dm_0 * Fx; + double v1_fx = dm_1 * Fx; + double v2_fx = dm_2 * Fx; + acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx + + ed.jacobians(jbase + 1, 1) * v1_fx + + ed.jacobians(jbase + 1, 2) * v2_fx); + acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx + + ed.jacobians(jbase + 2, 1) * v1_fx + + ed.jacobians(jbase + 2, 2) * v2_fx); + + double Fy = fy(dofofs+idof); + double v0_fy = dm_0 * Fy; + double v1_fy = dm_1 * Fy; + double v2_fy = dm_2 * Fy; + acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy + + ed.jacobians(jbase + 0, 1) * v1_fy + + ed.jacobians(jbase + 0, 2) * v2_fy); + acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy + + ed.jacobians(jbase + 2, 1) * v1_fy + + ed.jacobians(jbase + 2, 2) * v2_fy); + + double Fz = fz(dofofs+idof); + double v0_fz = dm_0 * Fz; + double v1_fz = dm_1 * Fz; + double v2_fz = dm_2 * Fz; + acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz + + ed.jacobians(jbase + 0, 1) * v1_fz + + ed.jacobians(jbase + 0, 2) * v2_fz); + acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz + + ed.jacobians(jbase + 1, 1) * v1_fz + + ed.jacobians(jbase + 1, 2) * v2_fz); + } + + dfx_dy(odof) += det * acc_dfx_dy; + dfx_dz(odof) += det * acc_dfx_dz; + + dfy_dx(odof) += det * acc_dfy_dx; + dfy_dz(odof) += det * acc_dfy_dz; + + dfz_dx(odof) += det * acc_dfz_dx; + dfz_dy(odof) += det * acc_dfz_dy; + + } + } + + dfx_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dy; + dfx_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dz; + + dfy_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dx; + dfy_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dz; + + dfz_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dx; + dfz_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dy; + + curl_fx.segment(dofofs, ed.num_bf) = alpha * (dfz_dy - dfy_dz); + curl_fy.segment(dofofs, ed.num_bf) = alpha * (dfx_dz - dfz_dx); + curl_fz.segment(dofofs, ed.num_bf) = alpha * (dfy_dx - dfx_dy); + } + + orient_elem_base += num_elems; + } +} + +template<size_t AK> +void compute_curl_kernel_planar(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, + vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz, + const vecxd& jx, const vecxd& jy, const vecxd& jz) +{ + /* Flop count: (45*ed.num_bf + 9)*ed.num_bf*num_total_elems */ + /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ + + assert(ed.g_order == 1); + + using KS = kernel_cpu_sizes<AK>; + assert(ed.num_bf == KS::num_bf); + + size_t orient_elem_base = 0; + for (size_t iO = 0; iO < ed.num_orientations; iO++) + { + size_t num_elems = ed.num_elems[iO]; + size_t orient_base = orient_elem_base * ed.num_bf; + + #pragma omp parallel for //num_threads(2) + for (size_t iT = 0; iT < num_elems; iT++) + { + size_t ent_iT = orient_elem_base + iT; + size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf; + + for (size_t odof = 0; odof < KS::num_bf; odof++) + { + double acc_dfx_dy = 0.0; + double acc_dfx_dz = 0.0; + + double acc_dfy_dx = 0.0; + double acc_dfy_dz = 0.0; + + double acc_dfz_dx = 0.0; + double acc_dfz_dy = 0.0; + + for (size_t idof = 0; idof < KS::num_bf; idof++) + { + auto dm_row = iO*KS::num_bf + odof; + auto dm_col0 = 0*KS::num_bf + idof; + auto dm_col1 = 1*KS::num_bf + idof; + auto dm_col2 = 2*KS::num_bf + idof; + + auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0); + auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1); + auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2); + + auto jbase = 3*ent_iT; + + double Fx = fx(dofofs+idof); + double v0_fx = dm_0 * Fx; + double v1_fx = dm_1 * Fx; + double v2_fx = dm_2 * Fx; + acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx + + ed.jacobians(jbase + 1, 1) * v1_fx + + ed.jacobians(jbase + 1, 2) * v2_fx); + acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx + + ed.jacobians(jbase + 2, 1) * v1_fx + + ed.jacobians(jbase + 2, 2) * v2_fx); + + double Fy = fy(dofofs+idof); + double v0_fy = dm_0 * Fy; + double v1_fy = dm_1 * Fy; + double v2_fy = dm_2 * Fy; + acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy + + ed.jacobians(jbase + 0, 1) * v1_fy + + ed.jacobians(jbase + 0, 2) * v2_fy); + acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy + + ed.jacobians(jbase + 2, 1) * v1_fy + + ed.jacobians(jbase + 2, 2) * v2_fy); + + double Fz = fz(dofofs+idof); + double v0_fz = dm_0 * Fz; + double v1_fz = dm_1 * Fz; + double v2_fz = dm_2 * Fz; + acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz + + ed.jacobians(jbase + 0, 1) * v1_fz + + ed.jacobians(jbase + 0, 2) * v2_fz); + acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz + + ed.jacobians(jbase + 1, 1) * v1_fz + + ed.jacobians(jbase + 1, 2) * v2_fz); + + } + + curl_fx(dofofs+odof) = alpha * (acc_dfz_dy - acc_dfy_dz) - jx(dofofs+odof); + curl_fy(dofofs+odof) = alpha * (acc_dfx_dz - acc_dfz_dx) - jy(dofofs+odof); + curl_fz(dofofs+odof) = alpha * (acc_dfy_dx - acc_dfx_dy) - jz(dofofs+odof); + + } + } + + orient_elem_base += num_elems; + } +} + +template<size_t AK> +void compute_curl_kernel_curved(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, + vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz, + const vecxd& jx, const vecxd& jy, const vecxd& jz) +{ + /* Flop count: ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + ((12*ed.num_bf-1) + 9)*ed.num_bf)*num_total_elems */ + /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ + + assert(ed.g_order == 1); + + using KS = kernel_cpu_sizes<AK>; + assert(ed.num_bf == KS::num_bf); + + size_t orient_elem_base = 0; + for (size_t iO = 0; iO < ed.num_orientations; iO++) + { + size_t num_elems = ed.num_elems[iO]; + size_t orient_base = orient_elem_base * ed.num_bf; + + #pragma omp parallel for //num_threads(2) + for (size_t iT = 0; iT < num_elems; iT++) + { + size_t ent_iT = orient_elem_base + iT; + size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf; + + vecxd dfx_dy = vecxd::Zero(ed.num_bf); + vecxd dfx_dz = vecxd::Zero(ed.num_bf); + + vecxd dfy_dx = vecxd::Zero(ed.num_bf); + vecxd dfy_dz = vecxd::Zero(ed.num_bf); + + vecxd dfz_dx = vecxd::Zero(ed.num_bf); + vecxd dfz_dy = vecxd::Zero(ed.num_bf); + + for (size_t iQp = 0; iQp < ed.num_qp; iQp++) + { + auto det = ed.cell_determinants(ent_iT * ed.num_qp + iQp); + + for (size_t odof = 0; odof < KS::num_bf; odof++) + { + double acc_dfx_dy = 0.0; + double acc_dfx_dz = 0.0; + + double acc_dfy_dx = 0.0; + double acc_dfy_dz = 0.0; + + double acc_dfz_dx = 0.0; + double acc_dfz_dy = 0.0; + + for (size_t idof = 0; idof < KS::num_bf; idof++) + { + auto dm_row = iO * KS::num_bf * ed.num_qp + KS::num_bf * iQp + odof; + auto dm_col0 = 0 * KS::num_bf + idof; + auto dm_col1 = 1 * KS::num_bf + idof; + auto dm_col2 = 2 * KS::num_bf + idof; + + auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0); + auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1); + auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2); + + auto jbase = 3 * ent_iT * ed.num_qp + 3 * iQp; + + double Fx = fx(dofofs+idof); + double v0_fx = dm_0 * Fx; + double v1_fx = dm_1 * Fx; + double v2_fx = dm_2 * Fx; + acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx + + ed.jacobians(jbase + 1, 1) * v1_fx + + ed.jacobians(jbase + 1, 2) * v2_fx); + acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx + + ed.jacobians(jbase + 2, 1) * v1_fx + + ed.jacobians(jbase + 2, 2) * v2_fx); + + double Fy = fy(dofofs+idof); + double v0_fy = dm_0 * Fy; + double v1_fy = dm_1 * Fy; + double v2_fy = dm_2 * Fy; + acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy + + ed.jacobians(jbase + 0, 1) * v1_fy + + ed.jacobians(jbase + 0, 2) * v2_fy); + acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy + + ed.jacobians(jbase + 2, 1) * v1_fy + + ed.jacobians(jbase + 2, 2) * v2_fy); + + double Fz = fz(dofofs+idof); + double v0_fz = dm_0 * Fz; + double v1_fz = dm_1 * Fz; + double v2_fz = dm_2 * Fz; + acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz + + ed.jacobians(jbase + 0, 1) * v1_fz + + ed.jacobians(jbase + 0, 2) * v2_fz); + acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz + + ed.jacobians(jbase + 1, 1) * v1_fz + + ed.jacobians(jbase + 1, 2) * v2_fz); + } + + dfx_dy(odof) += det * acc_dfx_dy; + dfx_dz(odof) += det * acc_dfx_dz; + + dfy_dx(odof) += det * acc_dfy_dx; + dfy_dz(odof) += det * acc_dfy_dz; + + dfz_dx(odof) += det * acc_dfz_dx; + dfz_dy(odof) += det * acc_dfz_dy; + + } + } + + dfx_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dy; + dfx_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dz; + + dfy_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dx; + dfy_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dz; + + dfz_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dx; + dfz_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dy; + + curl_fx.segment(dofofs, ed.num_bf) = alpha * (dfz_dy - dfy_dz) - jx.segment(dofofs, ed.num_bf); + curl_fy.segment(dofofs, ed.num_bf) = alpha * (dfx_dz - dfz_dx) - jy.segment(dofofs, ed.num_bf); + curl_fz.segment(dofofs, ed.num_bf) = alpha * (dfy_dx - dfx_dy) - jz.segment(dofofs, ed.num_bf); + } + + orient_elem_base += num_elems; + } +} + template<size_t AK> void compute_field_derivatives_kernel_planar(const entity_data_cpu& ed, const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz) @@ -87,7 +531,7 @@ template<size_t AK> void compute_field_derivatives_kernel_curved(const entity_data_cpu& ed, const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz) { - /* Flop count: ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_elements */ + /* Flop count: ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_total_elems */ /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ assert(ed.g_order > 1); @@ -168,10 +612,16 @@ void compute_field_derivatives_kernel_curved(const entity_data_cpu& ed, } +void compute_field_curl(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz); + +void compute_field_curl(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz, + const vecxd& jx, const vecxd& jy, const vecxd& jz); + void compute_field_derivatives(const entity_data_cpu& ed, const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz); - template<size_t AK> void compute_lifting_kernel_planar(const entity_data_cpu& ed, const vecxd& flux, vecxd& field) diff --git a/include/libgmshdg/kernels_gpu.h b/include/libgmshdg/kernels_gpu.h index 3f5c216dd8ccb10cc76cbab931074e738628b8f2..eef7f5345a2e41069dcf6900ceb88d454321e799 100644 --- a/include/libgmshdg/kernels_gpu.h +++ b/include/libgmshdg/kernels_gpu.h @@ -21,6 +21,7 @@ struct kernel_gpu_sizes<1> static const size_t num_bf = num_dofs_3D(1); //4 static const size_t num_fluxes = num_dofs_2D(1); //3 + static const size_t curl_threads = 512; static const size_t deriv_threads = 128; static const size_t lifting_threads = 128; @@ -36,6 +37,7 @@ struct kernel_gpu_sizes<2> static const size_t num_bf = num_dofs_3D(2); //10 static const size_t num_fluxes = num_dofs_2D(2); //6 + static const size_t curl_threads = 128; static const size_t deriv_threads = 128; static const size_t lifting_threads = 128; @@ -51,6 +53,7 @@ struct kernel_gpu_sizes<3> static const size_t num_bf = num_dofs_3D(3); //20 static const size_t num_fluxes = num_dofs_2D(3); //10 + static const size_t curl_threads = 128; static const size_t deriv_threads = 128; static const size_t lifting_threads = 128; @@ -66,6 +69,7 @@ struct kernel_gpu_sizes<4> static const size_t num_bf = num_dofs_3D(4); //35 static const size_t num_fluxes = num_dofs_2D(4); + static const size_t curl_threads = 128; static const size_t deriv_threads = 512; static const size_t lifting_threads = 128; @@ -81,10 +85,11 @@ struct kernel_gpu_sizes<5> static const size_t num_bf = num_dofs_3D(5); static const size_t num_fluxes = num_dofs_2D(5); - static const size_t deriv_threads = 1024; + static const size_t curl_threads = 64; + static const size_t deriv_threads = 512; static const size_t lifting_threads = 128; - static const size_t cells_per_dblock = 32; + static const size_t cells_per_dblock = 32; static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dblock_size = 128; static const size_t parallel_dblocks = 4; @@ -96,10 +101,11 @@ struct kernel_gpu_sizes<6> static const size_t num_bf = num_dofs_3D(6); static const size_t num_fluxes = num_dofs_2D(6); - static const size_t deriv_threads = 1024; + static const size_t curl_threads = 64; + static const size_t deriv_threads = 512; static const size_t lifting_threads = 128; - static const size_t cells_per_dblock = 32; + static const size_t cells_per_dblock = 32; static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dblock_size = 128; static const size_t parallel_dblocks = 4; @@ -148,6 +154,18 @@ size_t gpu_dblocks_dofs(const entity_data_cpu&); #define FROM_GPU false void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&, vecxd&, bool); +void +gpu_compute_field_curl(const entity_data_gpu& edg, + const double *fx, const double *fy, const double *fz, const double alpha, + double *curl_fx, double* curl_fy, double* curl_fz, + gpuStream_t stream = 0); + +void +gpu_compute_field_curl(const entity_data_gpu& edg, + const double *fx, const double *fy, const double *fz, const double alpha, + double *curl_fx, double* curl_fy, double* curl_fz, + double *src_x, double *src_y, double *src_z, + gpuStream_t stream = 0); void gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F, diff --git a/include/maxwell/maxwell_interface.h b/include/maxwell/maxwell_interface.h index dd7900d54c475a5b50b66b8fdef1612e460f65ed..943a2853919855783050155310a58aa48163a661 100644 --- a/include/maxwell/maxwell_interface.h +++ b/include/maxwell/maxwell_interface.h @@ -301,9 +301,11 @@ struct solver_state_gpu double curr_time; size_t curr_timestep; +#ifndef NEW_CURLS field_gpu dx; field_gpu dy; field_gpu dz; +#endif field_gpu k1; field_gpu k2; diff --git a/share/validation/params_test_maxwell_resonator.lua b/share/validation/params_test_maxwell_resonator.lua index 6943e4b69511e79dc8d185641bcdb3da16f62f43..f6c513d9e4a32cd747a03f81bc79aa834b3dbd55 100644 --- a/share/validation/params_test_maxwell_resonator.lua +++ b/share/validation/params_test_maxwell_resonator.lua @@ -4,10 +4,10 @@ --[[ Problem setup ]]-- sim.name = "test_maxwell_resonator" -- simulation name sim.dt = 1e-12 -- timestep size -sim.timesteps = 5001 -- num of iterations +sim.timesteps = 501 -- num of iterations sim.gmsh_model = "resonator.geo" -- gmsh model filename -sim.use_gpu = 0 -- 0: cpu, 1: gpu -sim.approx_order = 1 -- approximation order +sim.use_gpu = 1 -- 0: cpu, 1: gpu +sim.approx_order = 3 -- approximation order sim.time_integrator = "leapfrog" postpro.silo_output_rate = 100 postpro.cycle_print_rate = 100 -- console print rate @@ -87,22 +87,22 @@ end debug.analytical_solution = ansol --debug.dump_cell_ranks = true -if ( do_IO ) then - fh = io.open("energy.txt", "w") -end - -function on_timestep(ts) - local e = compute_energy() - local err = compute_error() - if ( do_IO ) then - local tot_energy = e.Ex + e.Ey + e.Ez + e.Hx + e.Hy + e.Hz - local energy_err = 100*(tot_energy - 2*We)/(2*We) - fh:write(energy_err, " ", tot_energy, " ", e.Ey, " ", e.Hx, " ", e.Hz, "\n") - end -end - -function on_exit() - if ( do_IO ) then - fh:close() - end -end +--if ( do_IO ) then +-- fh = io.open("energy.txt", "w") +--end + +--function on_timestep(ts) + --local e = compute_energy() + --local err = compute_error() + --if ( do_IO ) then + -- local tot_energy = e.Ex + e.Ey + e.Ez + e.Hx + e.Hy + e.Hz + -- local energy_err = 100*(tot_energy - 2*We)/(2*We) + -- fh:write(energy_err, " ", tot_energy, " ", e.Ey, " ", e.Hx, " ", e.Hz, "\n") + --end +--end + +--function on_exit() +-- if ( do_IO ) then +-- fh:close() +-- end +--end diff --git a/src/libgmshdg/kernels_cpu.cpp b/src/libgmshdg/kernels_cpu.cpp index a1a2ecfebaa3e40a2dfe26afa2d84289d0f4a300..72231d23dcc4a83bbbb6708ef75a5e60d945274b 100644 --- a/src/libgmshdg/kernels_cpu.cpp +++ b/src/libgmshdg/kernels_cpu.cpp @@ -10,6 +10,113 @@ #include "libgmshdg/entity_data.h" #include "libgmshdg/kernels_cpu.h" +void compute_field_curl(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, + vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz) +{ + switch (ed.a_order) + { + case 1: + if (ed.g_order == 1) + compute_curl_kernel_planar<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + else + compute_curl_kernel_curved<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + break; + + case 2: + if (ed.g_order == 1) + compute_curl_kernel_planar<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + else + compute_curl_kernel_curved<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + break; + + case 3: + if (ed.g_order == 1) + compute_curl_kernel_planar<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + else + compute_curl_kernel_curved<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + break; + + case 4: + if (ed.g_order == 1) + compute_curl_kernel_planar<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + else + compute_curl_kernel_curved<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + break; + + case 5: + if (ed.g_order == 1) + compute_curl_kernel_planar<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + else + compute_curl_kernel_curved<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + break; + + case 6: + if (ed.g_order == 1) + compute_curl_kernel_planar<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + else + compute_curl_kernel_curved<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz); + break; + + default: + std::cout << "compute_curl: invalid order" << std::endl; + } +} + +void compute_field_curl(const entity_data_cpu& ed, + const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, + vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz, + const vecxd& jx, const vecxd& jy, const vecxd& jz) +{ + switch (ed.a_order) + { + case 1: + if (ed.g_order == 1) + compute_curl_kernel_planar<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + else + compute_curl_kernel_curved<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + break; + + case 2: + if (ed.g_order == 1) + compute_curl_kernel_planar<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + else + compute_curl_kernel_curved<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + break; + + case 3: + if (ed.g_order == 1) + compute_curl_kernel_planar<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + else + compute_curl_kernel_curved<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + break; + + case 4: + if (ed.g_order == 1) + compute_curl_kernel_planar<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + else + compute_curl_kernel_curved<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + break; + + case 5: + if (ed.g_order == 1) + compute_curl_kernel_planar<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + else + compute_curl_kernel_curved<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + break; + + case 6: + if (ed.g_order == 1) + compute_curl_kernel_planar<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + else + compute_curl_kernel_curved<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz); + break; + + default: + std::cout << "compute_curl_sources: invalid order" << std::endl; + } +} + /* Field derivatives kernel, case for elements with planar faces */ void compute_field_derivatives(const entity_data_cpu& ed, const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz) diff --git a/src/libgmshdg/kernels_cuda.cu b/src/libgmshdg/kernels_cuda.cu index ac9b391ced4ba938772f9f9213583150cfaefc86..78a3c34d5da675ca5b920601e277444ed4cf9780 100644 --- a/src/libgmshdg/kernels_cuda.cu +++ b/src/libgmshdg/kernels_cuda.cu @@ -9,6 +9,147 @@ #include "libgmshdg/entity_data.h" #include "libgmshdg/kernels_gpu.h" +template<size_t K> +__global__ void +gpu_curl_planar(const double *fx, const double *fy, const double *fz, + const double * __restrict__ J, gpuTextureObject_t DM_tex, double alpha, + double * __restrict__ curl_fx, double * __restrict__ curl_fy, double * __restrict__ curl_fz, + int32_t num_all_elems, const int32_t* orients, int32_t dof_base) +{ + using KS = kernel_gpu_sizes<K>; + + const int32_t ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x); + if (ofs_in_entity >= num_all_elems*KS::num_bf) + return; + + const int32_t output_dof_offset = dof_base + ofs_in_entity; + + const int32_t iT = ofs_in_entity / KS::num_bf; + const int32_t elem_dof_base = dof_base + iT*KS::num_bf; + const int32_t iO = orients[iT]; + const int32_t DM_row = ofs_in_entity % KS::num_bf; + const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO; + const int32_t DM_base = DM_orient + DM_row; + const int32_t jac_ofs = 9*iT; + + double acc_dfx_dy = 0.0; + double acc_dfx_dz = 0.0; + + double acc_dfy_dx = 0.0; + double acc_dfy_dz = 0.0; + + double acc_dfz_dx = 0.0; + double acc_dfz_dy = 0.0; + + for (int32_t dof = 0; dof < KS::num_bf; dof++) + { + int32_t f_ofs = elem_dof_base + dof; + double dm_0 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof); + double dm_1 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + KS::num_bf); + double dm_2 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + 2*KS::num_bf); + double Fx = fx[f_ofs]; + double Fy = fy[f_ofs]; + double Fz = fz[f_ofs]; + + acc_dfx_dy += J[jac_ofs+3]*dm_0*Fx; + acc_dfx_dy += J[jac_ofs+4]*dm_1*Fx; + acc_dfx_dy += J[jac_ofs+5]*dm_2*Fx; + acc_dfx_dz += J[jac_ofs+6]*dm_0*Fx; + acc_dfx_dz += J[jac_ofs+7]*dm_1*Fx; + acc_dfx_dz += J[jac_ofs+8]*dm_2*Fx; + + acc_dfy_dx += J[jac_ofs+0]*dm_0*Fy; + acc_dfy_dx += J[jac_ofs+1]*dm_1*Fy; + acc_dfy_dx += J[jac_ofs+2]*dm_2*Fy; + acc_dfy_dz += J[jac_ofs+6]*dm_0*Fy; + acc_dfy_dz += J[jac_ofs+7]*dm_1*Fy; + acc_dfy_dz += J[jac_ofs+8]*dm_2*Fy; + + acc_dfz_dx += J[jac_ofs+0]*dm_0*Fz; + acc_dfz_dx += J[jac_ofs+1]*dm_1*Fz; + acc_dfz_dx += J[jac_ofs+2]*dm_2*Fz; + acc_dfz_dy += J[jac_ofs+3]*dm_0*Fz; + acc_dfz_dy += J[jac_ofs+4]*dm_1*Fz; + acc_dfz_dy += J[jac_ofs+5]*dm_2*Fz; + } + + curl_fx[output_dof_offset] = alpha * (acc_dfz_dy - acc_dfy_dz); + curl_fy[output_dof_offset] = alpha * (acc_dfx_dz - acc_dfz_dx); + curl_fz[output_dof_offset] = alpha * (acc_dfy_dx - acc_dfx_dy); +} + +template<size_t K> +__global__ void +gpu_curl_planar(const double *fx, const double *fy, const double *fz, + const double * __restrict__ J, gpuTextureObject_t DM_tex, double alpha, + double * __restrict__ curl_fx, double * __restrict__ curl_fy, double * __restrict__ curl_fz, + const double * __restrict__ src_x, double * __restrict__ src_y, double * __restrict__ src_z, + int32_t num_all_elems, const int32_t* orients, int32_t dof_base) +{ + using KS = kernel_gpu_sizes<K>; + + const int32_t ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x); + if (ofs_in_entity >= num_all_elems*KS::num_bf) + return; + + const int32_t output_dof_offset = dof_base + ofs_in_entity; + + const int32_t iT = ofs_in_entity / KS::num_bf; + const int32_t elem_dof_base = dof_base + iT*KS::num_bf; + const int32_t iO = orients[iT]; + const int32_t DM_row = ofs_in_entity % KS::num_bf; + const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO; + const int32_t DM_base = DM_orient + DM_row; + const int32_t jac_ofs = 9*iT; + + double acc_dfx_dy = 0.0; + double acc_dfx_dz = 0.0; + + double acc_dfy_dx = 0.0; + double acc_dfy_dz = 0.0; + + double acc_dfz_dx = 0.0; + double acc_dfz_dy = 0.0; + + for (int32_t dof = 0; dof < KS::num_bf; dof++) + { + int32_t f_ofs = elem_dof_base + dof; + double dm_0 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof); + double dm_1 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + KS::num_bf); + double dm_2 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + 2*KS::num_bf); + double Fx = fx[f_ofs]; + double Fy = fy[f_ofs]; + double Fz = fz[f_ofs]; + + acc_dfx_dy += J[jac_ofs+3]*dm_0*Fx; + acc_dfx_dy += J[jac_ofs+4]*dm_1*Fx; + acc_dfx_dy += J[jac_ofs+5]*dm_2*Fx; + acc_dfx_dz += J[jac_ofs+6]*dm_0*Fx; + acc_dfx_dz += J[jac_ofs+7]*dm_1*Fx; + acc_dfx_dz += J[jac_ofs+8]*dm_2*Fx; + + //F = fy[f_ofs]; + acc_dfy_dx += J[jac_ofs+0]*dm_0*Fy; + acc_dfy_dx += J[jac_ofs+1]*dm_1*Fy; + acc_dfy_dx += J[jac_ofs+2]*dm_2*Fy; + acc_dfy_dz += J[jac_ofs+6]*dm_0*Fy; + acc_dfy_dz += J[jac_ofs+7]*dm_1*Fy; + acc_dfy_dz += J[jac_ofs+8]*dm_2*Fy; + + //F = fz[f_ofs]; + acc_dfz_dx += J[jac_ofs+0]*dm_0*Fz; + acc_dfz_dx += J[jac_ofs+1]*dm_1*Fz; + acc_dfz_dx += J[jac_ofs+2]*dm_2*Fz; + acc_dfz_dy += J[jac_ofs+3]*dm_0*Fz; + acc_dfz_dy += J[jac_ofs+4]*dm_1*Fz; + acc_dfz_dy += J[jac_ofs+5]*dm_2*Fz; + } + + curl_fx[output_dof_offset] = alpha * (acc_dfz_dy - acc_dfy_dz) - src_x[output_dof_offset]; + curl_fy[output_dof_offset] = alpha * (acc_dfx_dz - acc_dfz_dx) - src_y[output_dof_offset]; + curl_fz[output_dof_offset] = alpha * (acc_dfy_dx - acc_dfx_dy) - src_z[output_dof_offset]; +} + /* compute α∇F */ template<size_t K> __global__ void @@ -65,63 +206,57 @@ gpu_deriv_planar(const double *F, const double * __restrict__ J, } template<size_t K> -__global__ void -gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J, - gpuTextureObject_t DM_tex, double * __restrict__ dF_dx, - double * __restrict__ dF_dy, double * __restrict__ dF_dz, - int32_t num_all_elems, int32_t* orients, int32_t dof_base) +void +launch_curl_kernel(const entity_data_gpu& edg, + const double *fx, const double *fy, const double *fz, const double alpha, + double *curl_fx, double* curl_fy, double* curl_fz, + cudaStream_t stream) { + auto J = edg.jacobians.data(); + auto Dtex = edg.differentiation_matrices.get_texture(); + int32_t num_elems = edg.num_all_elems; + auto orients = edg.element_orientation.data(); + auto num_orients = edg.element_orientation.size(); + using KS = kernel_gpu_sizes<K>; - if (threadIdx.x >= KS::dofs_per_dblock) - return; - const int32_t y_idx = (blockIdx.y * blockDim.y + threadIdx.y); - const int32_t elem_base = y_idx * KS::cells_per_dblock; - const int32_t elem_ofs = threadIdx.x / KS::num_bf; - const int32_t iT = elem_base + elem_ofs; + auto num_blocks = edg.num_bf*edg.num_all_elems/KS::curl_threads; + if (edg.num_bf*edg.num_all_elems % KS::curl_threads) + num_blocks += 1; - if (iT >= num_all_elems) - return; - - const int32_t block_dof_base = y_idx * KS::dblock_size; + if (edg.g_order == 1) + gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz, + J, Dtex, alpha, curl_fx, curl_fy, curl_fz, num_elems, orients, edg.dof_base); + //else + // compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz); +} - const int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x; +template<size_t K> +void +launch_curl_kernel(const entity_data_gpu& edg, + const double *fx, const double *fy, const double *fz, const double alpha, + double *curl_fx, double* curl_fy, double* curl_fz, + double *src_x, double *src_y, double *src_z, + cudaStream_t stream) +{ + auto J = edg.jacobians.data(); + auto Dtex = edg.differentiation_matrices.get_texture(); + int32_t num_elems = edg.num_all_elems; + auto orients = edg.element_orientation.data(); + auto num_orients = edg.element_orientation.size(); - const int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf; - const int32_t iO = orients[iT]; - const int32_t DM_row = threadIdx.x % KS::num_bf; - const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO; - const int32_t DM_base = DM_orient + DM_row; - const int32_t jac_ofs = 9*iT; + using KS = kernel_gpu_sizes<K>; - double accm_dF_dx = 0.0; - double accm_dF_dy = 0.0; - double accm_dF_dz = 0.0; - for (int32_t dof = 0; dof < KS::num_bf; dof++) - { - int32_t d_ofs = DM_base + 3*KS::num_bf*dof; - int32_t f_ofs = elem_dof_base + dof; - double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs); - accm_dF_dx += J[jac_ofs+0] * v; - accm_dF_dy += J[jac_ofs+3] * v; - accm_dF_dz += J[jac_ofs+6] * v; - - d_ofs = DM_base + 3*KS::num_bf*dof + KS::num_bf; - v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs); - accm_dF_dx += J[jac_ofs+1] * v; - accm_dF_dy += J[jac_ofs+4] * v; - accm_dF_dz += J[jac_ofs+7] * v; - - d_ofs = DM_base + 3*KS::num_bf*dof + 2*KS::num_bf; - v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs); - accm_dF_dx += J[jac_ofs+2] * v; - accm_dF_dy += J[jac_ofs+5] * v; - accm_dF_dz += J[jac_ofs+8] * v; - } + auto num_blocks = edg.num_bf*edg.num_all_elems/KS::curl_threads; + if (edg.num_bf*edg.num_all_elems % KS::curl_threads) + num_blocks += 1; - dF_dx[output_dof_offset] = accm_dF_dx; - dF_dy[output_dof_offset] = accm_dF_dy; - dF_dz[output_dof_offset] = accm_dF_dz; + if (edg.g_order == 1) + gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz, + J, Dtex, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, + num_elems, orients, edg.dof_base); + //else + // compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz); } template<size_t K> @@ -149,6 +284,83 @@ launch_deriv_kernel(const entity_data_gpu& edg, // compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz); } +void +gpu_compute_field_curl(const entity_data_gpu& edg, + const double *fx, const double *fy, const double *fz, const double alpha, + double *curl_fx, double* curl_fy, double* curl_fz, + cudaStream_t stream) +{ + + switch (edg.a_order) + { + case 1: + launch_curl_kernel<1>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream); + break; + + case 2: + launch_curl_kernel<2>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream); + break; + + case 3: + launch_curl_kernel<3>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream); + break; + + case 4: + launch_curl_kernel<4>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream); + break; + + case 5: + launch_curl_kernel<5>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream); + break; + + case 6: + launch_curl_kernel<6>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream); + break; + + default: + throw std::invalid_argument("compute_field_curl: invalid order"); + } +} + +void +gpu_compute_field_curl(const entity_data_gpu& edg, + const double *fx, const double *fy, const double *fz, const double alpha, + double *curl_fx, double* curl_fy, double* curl_fz, + double *src_x, double *src_y, double *src_z, + cudaStream_t stream) +{ + + switch (edg.a_order) + { + case 1: + launch_curl_kernel<1>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream); + break; + + case 2: + launch_curl_kernel<2>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream); + break; + + case 3: + launch_curl_kernel<3>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream); + break; + + case 4: + launch_curl_kernel<4>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream); + break; + + case 5: + launch_curl_kernel<5>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream); + break; + + case 6: + launch_curl_kernel<6>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream); + break; + + default: + throw std::invalid_argument("compute_field_curl: invalid order"); + } +} + void gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *f, double *df_dx, double* df_dy, double* df_dz, double alpha, diff --git a/src/maxwell/maxwell_cpu.cpp b/src/maxwell/maxwell_cpu.cpp index d72bb78262cca87973b4cde8bb333d260f541395..bd04eb3a38b1d4efce227c42198ec8387028bf54 100644 --- a/src/maxwell/maxwell_cpu.cpp +++ b/src/maxwell/maxwell_cpu.cpp @@ -112,6 +112,35 @@ init_from_model(const model& mod, solver_state& state, const maxwell::parameter_ /************************************************************************** * Computational kernels: curls */ +#ifdef NEW_CURLS +static void +compute_curls(solver_state& state, const field& curr, field& next) +{ + for (const auto& ed : state.eds) + { + compute_field_curl(ed, curr.Ex, curr.Ey, curr.Ez, -1.0, next.Hx, next.Hy, next.Hz); + compute_field_curl(ed, curr.Hx, curr.Hy, curr.Hz, 1.0, next.Ex, next.Ey, next.Ez, state.Jx_src, state.Jy_src, state.Jz_src); + } +} + +static void +compute_curls_E(solver_state& state, const field& curr, field& next) +{ + for (const auto& ed : state.eds) + { + compute_field_curl(ed, curr.Ex, curr.Ey, curr.Ez, -1.0, next.Hx, next.Hy, next.Hz); + } +} + +static void +compute_curls_H(solver_state& state, const field& curr, field& next) +{ + for (const auto& ed : state.eds) + { + compute_field_curl(ed, curr.Hx, curr.Hy, curr.Hz, 1.0, next.Ex, next.Ey, next.Ez, state.Jx_src, state.Jy_src, state.Jz_src); + } +} +#else static void compute_curls(solver_state& state, const field& curr, field& next) { @@ -162,6 +191,8 @@ compute_curls_H(solver_state& state, const field& curr, field& next) next.Ey = state.dz.Hx - state.dx.Hz - state.Jy_src; next.Ez = state.dx.Hy - state.dy.Hx - state.Jz_src; } +#endif + /************************************************************************** diff --git a/src/maxwell/maxwell_gpu.cpp b/src/maxwell/maxwell_gpu.cpp index e4941e7af16e994b7fc1a68f99b2ce5130db6bb6..be7fe862c25faa885a7919b61b2c9d30fc4b66cc 100644 --- a/src/maxwell/maxwell_gpu.cpp +++ b/src/maxwell/maxwell_gpu.cpp @@ -128,10 +128,12 @@ void init_from_model(const model& mod, solver_state_gpu& state, const maxwell::p { state.emf_curr.resize( mod.num_dofs() ); state.emf_next.resize( mod.num_dofs() ); + +#ifndef NEW_CURLS state.dx.resize( mod.num_dofs() ); state.dy.resize( mod.num_dofs() ); state.dz.resize( mod.num_dofs() ); - +#endif state.jumps.resize( mod.num_fluxes() ); state.fluxes.resize( mod.num_fluxes() ); @@ -188,6 +190,74 @@ compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field& } } +#ifdef NEW_CURLS +static void +compute_curls(solver_state_gpu& state, const field_gpu&curr, field_gpu& next, bool use_sources) +{ + auto currp = curr.data(); + auto nextp = next.data(); + auto Jx = state.Jx_src_gpu.data(); + auto Jy = state.Jy_src_gpu.data(); + auto Jz = state.Jz_src_gpu.data(); + + if (use_sources) + { + for (const auto& edg : state.edgs) + { + gpu_compute_field_curl(edg, currp.Ex, currp.Ey, currp.Ez, -1.0, nextp.Hx, nextp.Hy, nextp.Hz, state.compute_stream); + gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, Jx, Jy, Jz, state.compute_stream); + } + } + else { + for (const auto& edg : state.edgs) + { + gpu_compute_field_curl(edg, currp.Ex, currp.Ey, currp.Ez, -1.0, nextp.Hx, nextp.Hy, nextp.Hz, state.compute_stream); + gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, state.compute_stream); + } + + } +} + +static void +compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next) +{ + auto currp = curr.data(); + auto nextp = next.data(); + + for (const auto& edg : state.edgs) + { + gpu_compute_field_curl(edg, currp.Ex, currp.Ey, currp.Ez, -1.0, nextp.Hx, nextp.Hy, nextp.Hz, state.compute_stream); + } +} + +static void +compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources) +{ + auto currp = curr.data(); + auto nextp = next.data(); + auto Jx = state.Jx_src_gpu.data(); + auto Jy = state.Jy_src_gpu.data(); + auto Jz = state.Jz_src_gpu.data(); + + + if (use_sources) + { + for (const auto& edg : state.edgs) + { + gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, Jx, Jy, Jz, state.compute_stream); + } + + } + else + { + for (const auto& edg : state.edgs) + { + gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, state.compute_stream); + } + } + +} +#else static void compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources) { @@ -282,8 +352,8 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream); gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream); } - } +#endif static void compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources) diff --git a/src/maxwell/maxwell_solver.cpp b/src/maxwell/maxwell_solver.cpp index 9f39c94a5973b58f93b5d83c00da9dafe3a75e9c..d5e3efb992a9ccc19928655db1c214209da23a1a 100644 --- a/src/maxwell/maxwell_solver.cpp +++ b/src/maxwell/maxwell_solver.cpp @@ -247,6 +247,7 @@ void solver_mainloop(const model& mod, State& state, maxwell::parameter_loader& do_sources(mod, state, mpl); swap(state, mpl); + cudaDeviceSynchronize(); double time = tc.toc(); total_iteration_time += time; diff --git a/tests/profile_curl_gpu.cpp b/tests/profile_curl_gpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..90999b70941662942b9740d798b1a51370722472 --- /dev/null +++ b/tests/profile_curl_gpu.cpp @@ -0,0 +1,168 @@ +/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin + * solver for Conservation Laws. + * + * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège + * + * This code is released under GNU AGPLv3 license, see LICENSE.txt for details. + */ + +#include <iostream> +#include <iomanip> + +#include <unistd.h> + +#include "test.h" +#include "sgr.hpp" +#include "libgmshdg/gmsh_io.h" +#include "libgmshdg/entity_data.h" +#include "libgmshdg/kernels_cpu.h" +#include "timecounter.h" + +#include "libgmshdg/kernels_gpu.h" + +using namespace sgr; + +static void +make_geometry(int order, double mesh_h) +{ + gm::add("difftest"); + gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 1.0); + gmo::synchronize(); + + gvp_t vp; + gm::getEntities(vp); + gmm::setSize(vp, mesh_h); +} + +int profile_differentiation(int geometric_order, int approximation_order) +{ + std::cout << cyanfg << "Testing geometric order " << geometric_order; + std::cout << ", approximation order = " << approximation_order << nofg; + std::cout << std::endl; + + auto fx = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.y())*std::sin(M_PI*pt.z()); + }; + auto fy = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.z())*std::sin(M_PI*pt.x()); + }; + auto fz = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.x())*std::sin(M_PI*pt.y()); + }; + + auto cfx = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.x())*std::cos(M_PI*pt.y()) - M_PI*std::cos(M_PI*pt.z())*std::sin(M_PI*pt.x()); + }; + auto cfy = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.y())*std::cos(M_PI*pt.z()) - M_PI*std::cos(M_PI*pt.x())*std::sin(M_PI*pt.y()); + }; + auto cfz = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.z())*std::cos(M_PI*pt.x()) - M_PI*std::cos(M_PI*pt.y())*std::sin(M_PI*pt.z()); + }; + + double h = 0.04; + make_geometry(0,h); + gmm::generate( DIMENSION(3) ); + + model mod(geometric_order, approximation_order); + mod.build(); + + auto& e0 = mod.entity_at(0); + e0.sort_by_orientation(); + + vecxd proj_fx = e0.project(fx); + vecxd proj_fy = e0.project(fy); + vecxd proj_fz = e0.project(fz); + + /* Make CPU entity data */ + entity_data_cpu ed; + e0.populate_entity_data(ed, mod); + /* Derive GPU entity data */ + entity_data_gpu edg(ed); + + auto model_num_dofs = mod.num_dofs(); + + /* Prepare I/O vectors and call kernel */ + device_vector<double> proj_fx_gpu(proj_fx.data(), proj_fx.size()); + device_vector<double> proj_fy_gpu(proj_fy.data(), proj_fy.size()); + device_vector<double> proj_fz_gpu(proj_fz.data(), proj_fz.size()); + + device_vector<double> comp_cfx_gpu(model_num_dofs); + device_vector<double> comp_cfy_gpu(model_num_dofs); + device_vector<double> comp_cfz_gpu(model_num_dofs); + + timecounter_gpu tc; + tc.tic(); + gpu_compute_field_curl(edg, proj_fx_gpu.data(), proj_fy_gpu.data(), proj_fz_gpu.data(), 1.0, + comp_cfx_gpu.data(), comp_cfy_gpu.data(), comp_cfz_gpu.data()); + double time = tc.toc(); + + vecxd comp_cfx = vecxd::Zero(model_num_dofs); + vecxd comp_cfy = vecxd::Zero(model_num_dofs); + vecxd comp_cfz = vecxd::Zero(model_num_dofs); + + comp_cfx_gpu.copyout( comp_cfx.data() ); + comp_cfy_gpu.copyout( comp_cfy.data() ); + comp_cfz_gpu.copyout( comp_cfz.data() ); + + if (geometric_order == 1) + { + std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + double flops = (45*ed.num_bf + 3)*ed.num_bf*e0.num_cells(); + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + } + else + { + std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + double flops = ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*e0.num_cells(); + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + } + + vecxd proj_cfx = e0.project(cfx); + vecxd proj_cfy = e0.project(cfy); + vecxd proj_cfz = e0.project(cfz); + + double err_x = 0.0; + double err_y = 0.0; + double err_z = 0.0; + + for (size_t iT = 0; iT < e0.num_cells(); iT++) + { + auto& pe = e0.cell(iT); + auto& re = e0.cell_refelem(pe); + matxd mass = e0.mass_matrix(iT); + auto num_bf = re.num_basis_functions(); + vecxd diff_x = comp_cfx.segment(iT*num_bf, num_bf) - proj_cfx.segment(iT*num_bf, num_bf); + vecxd diff_y = comp_cfy.segment(iT*num_bf, num_bf) - proj_cfy.segment(iT*num_bf, num_bf); + vecxd diff_z = comp_cfz.segment(iT*num_bf, num_bf) - proj_cfz.segment(iT*num_bf, num_bf); + + err_x += diff_x.dot(mass*diff_x); + err_y += diff_y.dot(mass*diff_y); + err_z += diff_z.dot(mass*diff_z); + } + std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y); + std::cout << " " << std::sqrt(err_z) << std::endl; + + return 0; +} + +int main(int argc, const char *argv[]) +{ + gmsh::initialize(); + gmsh::option::setNumber("General.Terminal", 0); + gmsh::option::setNumber("Mesh.Algorithm", 1); + gmsh::option::setNumber("Mesh.Algorithm3D", 1); + + if (argc != 3) + { + std::cout << argv[0] << "<gorder> <aorder>" << std::endl; + return 1; + } + + auto go = atoi(argv[1]); + auto ao = atoi(argv[2]); + + profile_differentiation(go, ao); + + return 0; +} diff --git a/tests/profile_differentiation_gpu.cpp b/tests/profile_differentiation_gpu.cpp index 88d7767d690eeab71b9c0e33c8b8d3980a9120c5..ed8b71ef41c0d10214c222ebee82cb134546c5e7 100644 --- a/tests/profile_differentiation_gpu.cpp +++ b/tests/profile_differentiation_gpu.cpp @@ -55,6 +55,8 @@ int profile_differentiation(int geometric_order, int approximation_order) double h = 0.04; make_geometry(0,h); + gmm::generate( DIMENSION(3) ); + model mod(geometric_order, approximation_order); mod.build(); @@ -104,7 +106,7 @@ int profile_differentiation(int geometric_order, int approximation_order) vecxd Pdf_dx_e0 = e0.project(df_dx); vecxd Pdf_dy_e0 = e0.project(df_dy); vecxd Pdf_dz_e0 = e0.project(df_dz); - +/* double err_x = 0.0; double err_y = 0.0; double err_z = 0.0; @@ -125,7 +127,7 @@ int profile_differentiation(int geometric_order, int approximation_order) } std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y); std::cout << " " << std::sqrt(err_z) << std::endl; - +*/ return 0; } diff --git a/tests/test_curl.cpp b/tests/test_curl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee57fcf1a6a24e930c9865ce4159b7fc45c6fe39 --- /dev/null +++ b/tests/test_curl.cpp @@ -0,0 +1,251 @@ +/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin + * solver for Conservation Laws. + * + * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège + * + * This code is released under GNU AGPLv3 license, see LICENSE.txt for details. + */ + +#include <iostream> +#include <iomanip> + +#include <unistd.h> + +#include "test.h" +#include "sgr.hpp" +#include "timecounter.h" +#include "libgmshdg/gmsh_io.h" +#include "libgmshdg/entity_data.h" +#include "libgmshdg/kernels_cpu.h" +#include "libgmshdg/silo_output.hpp" + +using namespace sgr; + +static void +make_geometry(int order, double mesh_h) +{ + gm::add("difftest"); + + std::vector<std::pair<int,int>> objects; + + objects.push_back( + std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) ) + ); + objects.push_back( + std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) ) + ); + + std::vector<std::pair<int, int>> tools; + gmsh::vectorpair odt; + std::vector<gmsh::vectorpair> odtm; + gmo::fragment(objects, tools, odt, odtm); + + gmo::synchronize(); + + gvp_t vp; + gm::getEntities(vp); + gmm::setSize(vp, mesh_h); +} + +int test_curl_convergence(int geometric_order, int approximation_order) +{ + std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 }); + std::vector<double> errors; + + std::cout << std::endl << cyanfg << "Testing geometric order " << geometric_order; + std::cout << ", approximation order = " << approximation_order << nofg; + std::cout << std::endl; + + auto fx = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.y())*std::sin(M_PI*pt.z()); + }; + auto fy = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.z())*std::sin(M_PI*pt.x()); + }; + auto fz = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.x())*std::sin(M_PI*pt.y()); + }; + + auto cfx = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.x())*std::cos(M_PI*pt.y()) - M_PI*std::cos(M_PI*pt.z())*std::sin(M_PI*pt.x()); + }; + auto cfy = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.y())*std::cos(M_PI*pt.z()) - M_PI*std::cos(M_PI*pt.x())*std::sin(M_PI*pt.y()); + }; + auto cfz = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.z())*std::cos(M_PI*pt.x()) - M_PI*std::cos(M_PI*pt.y())*std::sin(M_PI*pt.z()); + }; + + std::vector<double> errs_x; + std::vector<double> errs_y; + std::vector<double> errs_z; + + for (size_t i = 0; i < sizes.size(); i++) + { + double h = sizes[i]; + make_geometry(0,h); + gmm::generate( DIMENSION(3) ); + + model mod(geometric_order, approximation_order); + mod.build(); + + std::stringstream ss; + ss << "diff_go_" << geometric_order << "_ao_" << approximation_order; + ss << "_seq_" << i << ".silo"; + +#ifdef WRITE_TEST_OUTPUTS + silo silodb; + silodb.create_db(ss.str()); + silodb.import_mesh_from_gmsh(); + silodb.write_mesh(); +#endif + auto model_num_dofs = mod.num_dofs(); + + vecxd proj_fx = vecxd::Zero(model_num_dofs); + vecxd proj_fy = vecxd::Zero(model_num_dofs); + vecxd proj_fz = vecxd::Zero(model_num_dofs); + + vecxd proj_cfx = vecxd::Zero(model_num_dofs); + vecxd proj_cfy = vecxd::Zero(model_num_dofs); + vecxd proj_cfz = vecxd::Zero(model_num_dofs); + + vecxd comp_cfx = vecxd::Zero(model_num_dofs); + vecxd comp_cfy = vecxd::Zero(model_num_dofs); + vecxd comp_cfz = vecxd::Zero(model_num_dofs); + + std::vector<entity_data_cpu> eds; + + for (auto& e : mod) + { + e.project(fx, proj_fx); + e.project(fy, proj_fy); + e.project(fz, proj_fz); + + e.project(cfx, proj_cfx); + e.project(cfy, proj_cfy); + e.project(cfz, proj_cfz); + + entity_data_cpu ed; + e.populate_entity_data(ed, mod); + eds.push_back( std::move(ed) ); + } + + for (auto& ed : eds) + { + timecounter tc; + tc.tic(); + compute_field_curl(ed, proj_fx, proj_fy, proj_fz, 1.0, comp_cfx, comp_cfy, comp_cfz); + double time = tc.toc(); + + auto num_cells = num_elems_all_orientations(ed); + auto num_bf = num_dofs_3D(approximation_order); + auto field_size = num_cells*num_bf; + + if (geometric_order == 1) + { + std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: "; + double flops = (45*ed.num_bf+6)*ed.num_bf*num_cells; + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + + + double gbytes = (6*field_size // input curl and accumulators + +num_cells*3*3)*8/1e9; + std::cout << "Kernel bandwidth: " << gbytes/time << " GB/s" << std::endl; + } + else + { + std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: "; + double flops = ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + 6*((2*ed.num_bf-1) + 1)*ed.num_bf)*num_cells; + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + } + } + + double err_x = 0.0; + double err_y = 0.0; + double err_z = 0.0; + +#ifdef WRITE_TEST_OUTPUTS + auto model_num_cells = mod.num_cells(); + std::vector<double> var_cfx(model_num_cells); + std::vector<double> var_cfy(model_num_cells); + std::vector<double> var_cfz(model_num_cells); +#endif + + for (auto& e : mod) + { + for (size_t iT = 0; iT < e.num_cells(); iT++) + { + auto& pe = e.cell(iT); + auto& re = e.cell_refelem(pe); + matxd mass = e.mass_matrix(iT); + auto num_bf = re.num_basis_functions(); + auto ofs = e.cell_model_dof_offset(iT); + vecxd diff_x = proj_cfx.segment(ofs, num_bf) - comp_cfx.segment(ofs, num_bf); + vecxd diff_y = proj_cfy.segment(ofs, num_bf) - comp_cfy.segment(ofs, num_bf); + vecxd diff_z = proj_cfz.segment(ofs, num_bf) - comp_cfz.segment(ofs, num_bf); + + err_x += diff_x.dot(mass*diff_x); + err_y += diff_y.dot(mass*diff_y); + err_z += diff_z.dot(mass*diff_z); + +#ifdef WRITE_TEST_OUTPUTS + auto gi = e.cell_global_index_by_gmsh(iT); + assert(gi < model_num_cells); + vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.}); + var_cfx[gi] = comp_cfx.segment(ofs, num_bf).dot(phi_bar); + var_cfy[gi] = comp_cfy.segment(ofs, num_bf).dot(phi_bar); + var_cfz[gi] = comp_cfz.segment(ofs, num_bf).dot(phi_bar); +#endif + } + std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y); + std::cout << " " << std::sqrt(err_z) << std::endl; + } + +#ifdef WRITE_TEST_OUTPUTS + silodb.write_zonal_variable("cfx", var_cfx); + silodb.write_zonal_variable("cfy", var_cfy); + silodb.write_zonal_variable("cfz", var_cfz); +#endif + + errs_x.push_back( std::sqrt(err_x) ); + errs_y.push_back( std::sqrt(err_y) ); + errs_z.push_back( std::sqrt(err_z) ); + std::cout << std::endl; + } + + double rate_x = 0.0; + double rate_y = 0.0; + double rate_z = 0.0; + + std::cout << Byellowfg << "rate df/dx rate df/dy rate df/dz" << reset << std::endl; + for (size_t i = 1; i < sizes.size(); i++) + { + std::cout << (rate_x = std::log2(errs_x[i-1]/errs_x[i]) ) << " "; + std::cout << (rate_y = std::log2(errs_y[i-1]/errs_y[i]) ) << " "; + std::cout << (rate_z = std::log2(errs_z[i-1]/errs_z[i]) ) << std::endl; + } + + COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2); + COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2); + COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2); + + return 0; +} + +int main(void) +{ + gmsh::initialize(); + gmsh::option::setNumber("General.Terminal", 0); + gmsh::option::setNumber("Mesh.Algorithm", 1); + gmsh::option::setNumber("Mesh.Algorithm3D", 1); + + + int failed_tests = 0; + + std::cout << Bmagentafg << " *** TESTING: CURL ***" << reset << std::endl; + for (size_t go = 1; go < 2; go++) + for (size_t ao = 3; ao < 5; ao++) + failed_tests += test_curl_convergence(go, ao); + + return failed_tests; +} diff --git a/tests/test_curl_gpu.cpp b/tests/test_curl_gpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d10b51d30b5884cd56d2e6d950eaa55af0d0baba --- /dev/null +++ b/tests/test_curl_gpu.cpp @@ -0,0 +1,264 @@ +/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin + * solver for Conservation Laws. + * + * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège + * + * This code is released under GNU AGPLv3 license, see LICENSE.txt for details. + */ + +#include <iostream> +#include <iomanip> + +#include <unistd.h> + +#include "test.h" +#include "sgr.hpp" +#include "timecounter.h" +#include "libgmshdg/gmsh_io.h" +#include "libgmshdg/entity_data.h" +#include "libgmshdg/kernels_cpu.h" +#include "libgmshdg/silo_output.hpp" +#include "libgmshdg/kernels_gpu.h" + +using namespace sgr; + +static void +make_geometry(int order, double mesh_h) +{ + gm::add("difftest"); + + std::vector<std::pair<int,int>> objects; + + objects.push_back( + std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) ) + ); + objects.push_back( + std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) ) + ); + + std::vector<std::pair<int, int>> tools; + gmsh::vectorpair odt; + std::vector<gmsh::vectorpair> odtm; + gmo::fragment(objects, tools, odt, odtm); + + gmo::synchronize(); + + gvp_t vp; + gm::getEntities(vp); + gmm::setSize(vp, mesh_h); +} + +int test_curl_convergence(int geometric_order, int approximation_order) +{ + std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 }); + std::vector<double> errors; + + std::cout << std::endl << cyanfg << "Testing geometric order " << geometric_order; + std::cout << ", approximation order = " << approximation_order << nofg; + std::cout << std::endl; + + auto fx = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.y())*std::sin(M_PI*pt.z()); + }; + auto fy = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.z())*std::sin(M_PI*pt.x()); + }; + auto fz = [](const point_3d& pt) -> double { + return std::sin(M_PI*pt.x())*std::sin(M_PI*pt.y()); + }; + + auto cfx = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.x())*std::cos(M_PI*pt.y()) - M_PI*std::cos(M_PI*pt.z())*std::sin(M_PI*pt.x()); + }; + auto cfy = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.y())*std::cos(M_PI*pt.z()) - M_PI*std::cos(M_PI*pt.x())*std::sin(M_PI*pt.y()); + }; + auto cfz = [](const point_3d& pt) -> double { + return M_PI*std::sin(M_PI*pt.z())*std::cos(M_PI*pt.x()) - M_PI*std::cos(M_PI*pt.y())*std::sin(M_PI*pt.z()); + }; + + std::vector<double> errs_x; + std::vector<double> errs_y; + std::vector<double> errs_z; + + for (size_t i = 0; i < sizes.size(); i++) + { + double h = sizes[i]; + make_geometry(0,h); + gmm::generate( DIMENSION(3) ); + + model mod(geometric_order, approximation_order); + mod.build(); + +#ifdef WRITE_TEST_OUTPUTS + std::stringstream ss; + ss << "diff_go_" << geometric_order << "_ao_" << approximation_order; + ss << "_seq_" << i << ".silo"; + + silo silodb; + silodb.create_db(ss.str()); + silodb.import_mesh_from_gmsh(); + silodb.write_mesh(); +#endif + auto model_num_dofs = mod.num_dofs(); + + vecxd proj_fx = vecxd::Zero(model_num_dofs); + vecxd proj_fy = vecxd::Zero(model_num_dofs); + vecxd proj_fz = vecxd::Zero(model_num_dofs); + + vecxd proj_cfx = vecxd::Zero(model_num_dofs); + vecxd proj_cfy = vecxd::Zero(model_num_dofs); + vecxd proj_cfz = vecxd::Zero(model_num_dofs); + + std::vector<entity_data_gpu> edgs; + + size_t model_num_dofs_gpu = model_num_dofs; + + for (auto& e : mod) + { + + e.project(fx, proj_fx); + e.project(fy, proj_fy); + e.project(fz, proj_fz); + + e.project(cfx, proj_cfx); + e.project(cfy, proj_cfy); + e.project(cfz, proj_cfz); + + entity_data_cpu ed; + e.populate_entity_data(ed, mod); + entity_data_gpu edg(ed); + edgs.push_back( std::move(edg) ); + } + + /* Prepare I/O vectors and call kernel */ + + device_vector<double> proj_fx_gpu(proj_fx.data(), proj_fx.size()); + device_vector<double> proj_fy_gpu(proj_fy.data(), proj_fy.size()); + device_vector<double> proj_fz_gpu(proj_fz.data(), proj_fz.size()); + + device_vector<double> comp_cfx_gpu(model_num_dofs_gpu); + device_vector<double> comp_cfy_gpu(model_num_dofs_gpu); + device_vector<double> comp_cfz_gpu(model_num_dofs_gpu); + + for (auto& edg : edgs) + { + timecounter_gpu tc; + tc.tic(); + gpu_compute_field_curl(edg, proj_fx_gpu.data(), proj_fy_gpu.data(), proj_fz_gpu.data(), 1.0, + comp_cfx_gpu.data(), comp_cfy_gpu.data(), comp_cfz_gpu.data()); + double time = tc.toc(); + + auto num_cells = edg.num_all_elems; + if (geometric_order == 1) + { + std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + double flops = (45*edg.num_bf+6)*edg.num_bf*num_cells; + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + } + else + { + std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + double flops = ((45*edg.num_bf+12)*edg.num_bf /* *edg.num_qp */ + 6*((2*edg.num_bf-1) + 1)*edg.num_bf)*num_cells; + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + } + } + + vecxd comp_cfx = vecxd::Zero(model_num_dofs_gpu); + vecxd comp_cfy = vecxd::Zero(model_num_dofs_gpu); + vecxd comp_cfz = vecxd::Zero(model_num_dofs_gpu); + + comp_cfx_gpu.copyout( comp_cfx.data() ); + comp_cfy_gpu.copyout( comp_cfy.data() ); + comp_cfz_gpu.copyout( comp_cfz.data() ); + + double err_x = 0.0; + double err_y = 0.0; + double err_z = 0.0; + +#ifdef WRITE_TEST_OUTPUTS + auto model_num_cells = mod.num_cells(); + std::vector<double> var_cfz(model_num_cells); + std::vector<double> var_cfy(model_num_cells); + std::vector<double> var_cfz(model_num_cells); +#endif + + for (auto& e : mod) + { + for (size_t iT = 0; iT < e.num_cells(); iT++) + { + auto& pe = e.cell(iT); + auto& re = e.cell_refelem(pe); + matxd mass = e.mass_matrix(iT); + auto num_bf = re.num_basis_functions(); + auto ofs = e.cell_model_dof_offset(iT); + vecxd diff_x = proj_cfx.segment(ofs, num_bf) - comp_cfx.segment(ofs, num_bf); + vecxd diff_y = proj_cfy.segment(ofs, num_bf) - comp_cfy.segment(ofs, num_bf); + vecxd diff_z = proj_cfz.segment(ofs, num_bf) - comp_cfz.segment(ofs, num_bf); + + err_x += diff_x.dot(mass*diff_x); + err_y += diff_y.dot(mass*diff_y); + err_z += diff_z.dot(mass*diff_z); + +#ifdef WRITE_TEST_OUTPUTS + auto gi = e.cell_global_index_by_gmsh(iT); + assert(gi < model_num_cells); + vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.}); + var_cfz[gi] = Cdf_dx.segment(ofs, num_bf).dot(phi_bar); + var_cfy[gi] = Cdf_dy.segment(ofs, num_bf).dot(phi_bar); + var_cfz[gi] = Cdf_dz.segment(ofs, num_bf).dot(phi_bar); +#endif + } + std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y); + std::cout << " " << std::sqrt(err_z) << std::endl; + } + +#ifdef WRITE_TEST_OUTPUTS + silodb.write_zonal_variable("df_dx", var_cfz); + silodb.write_zonal_variable("df_dy", var_cfy); + silodb.write_zonal_variable("df_dz", var_cfz); +#endif + + errs_x.push_back( std::sqrt(err_x) ); + errs_y.push_back( std::sqrt(err_y) ); + errs_z.push_back( std::sqrt(err_z) ); + + std::cout << std::endl; + } + + double rate_x = 0.0; + double rate_y = 0.0; + double rate_z = 0.0; + + std::cout << Byellowfg << "rate df/dx rate df/dy rate df/dz" << reset << std::endl; + for (size_t i = 1; i < sizes.size(); i++) + { + std::cout << (rate_x = std::log2(errs_x[i-1]/errs_x[i]) ) << " "; + std::cout << (rate_y = std::log2(errs_y[i-1]/errs_y[i]) ) << " "; + std::cout << (rate_z = std::log2(errs_z[i-1]/errs_z[i]) ) << std::endl; + } + + COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2); + COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2); + COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2); + + return 0; +} + +int main(void) +{ + gmsh::initialize(); + gmsh::option::setNumber("General.Terminal", 0); + gmsh::option::setNumber("Mesh.Algorithm", 1); + gmsh::option::setNumber("Mesh.Algorithm3D", 1); + + + int failed_tests = 0; + + std::cout << Bmagentafg << " *** TESTING: CURL ***" << reset << std::endl; + for (size_t go = 1; go < 2; go++) + for (size_t ao = 1; ao < 7; ao++) + failed_tests += test_curl_convergence(go, ao); + + return failed_tests; +} diff --git a/tests/test_differentiation.cpp b/tests/test_differentiation.cpp index 8bec165caef2cc3c717fbc4224204f3a4d68a5cc..a9080c713935a4c0d35717c59c83254a302e3f62 100644 --- a/tests/test_differentiation.cpp +++ b/tests/test_differentiation.cpp @@ -52,7 +52,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 }); std::vector<double> errors; - std::cout << cyanfg << "Testing geometric order " << geometric_order; + std::cout << std::endl << cyanfg << "Testing geometric order " << geometric_order; std::cout << ", approximation order = " << approximation_order << nofg; std::cout << std::endl; @@ -125,13 +125,13 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde auto num_cells = num_elems_all_orientations(ed); if (geometric_order == 1) { - std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: "; double flops = 21*ed.num_bf*ed.num_bf*num_cells; std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; } else { - std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: "; double flops = ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_cells; std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; } @@ -188,6 +188,8 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde errs_x.push_back( std::sqrt(err_x) ); errs_y.push_back( std::sqrt(err_y) ); errs_z.push_back( std::sqrt(err_z) ); + + std::cout << std::endl; } double rate_x = 0.0; diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp index f36cca30f4deeaf52a39bee3ed027da8249180c6..ecfd5210a81e5e082ad2a6bff49c4ee674cb2da0 100644 --- a/tests/test_differentiation_gpu.cpp +++ b/tests/test_differentiation_gpu.cpp @@ -102,7 +102,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde std::vector<entity_data_gpu> edgs; size_t model_num_dofs_gpu = model_num_dofs; - + for (auto& e : mod) { e.project(f, Pf);