From 8bbfd799e095897e88adabb06008f619b6acb46b Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Thu, 16 Sep 2021 13:50:39 +0200
Subject: [PATCH] Added MPI support to leapfrog/cpu.

---
 doc/lua_api.md              |  15 +++-
 include/gmsh_io.h           |   5 ++
 include/maxwell_interface.h |   3 +
 src/gmsh_io.cpp             |   7 +-
 src/maxwell_common.cpp      |  20 ++---
 src/maxwell_cpu.cpp         | 149 ++++++++++++++++++++++++++++++++++--
 src/maxwell_interface.cpp   |  18 +++++
 src/maxwell_solver.cpp      | 146 +++++++++++++++++++++++++++++++++--
 src/param_loader.cpp        |  14 ++++
 9 files changed, 349 insertions(+), 28 deletions(-)

diff --git a/doc/lua_api.md b/doc/lua_api.md
index 409abc6..c1cf993 100644
--- a/doc/lua_api.md
+++ b/doc/lua_api.md
@@ -7,7 +7,7 @@ This file documents the Lua API available in the solver. API has a *general* par
 The *general* part has to do with configuration not related to a specific problem (i.e. timestep, geometry file), whereas the *problem-specific* part configures all the parameters that make sense only on a given problem (i.e. materials and sources). This separation is reflected also in how the configuration is handled internally.
 
 ## API version
-This document describes the API available on the version v0.1 of the solver.
+This document describes the API available on the version v0.3 of the solver.
 
 ## General interface
 
@@ -20,16 +20,21 @@ This document describes the API available on the version v0.1 of the solver.
 - `sim.use_gpu` (0/1): enable/disable GPU usage.
 - `sim.approx_order` (integer): method approximation order.
 - `sim.geom_order` (integer): geometry order. Only order 1 supported for now.
-- `sim.time_integrator` (string): `"rk4"`, `"leapfrog"` or `"euler"`. Euler is only for test purporses, as it is numerically instable. Leapfrog works on CPU for now, use it if you have dissipative materials.
+- `sim.time_integrator` (string): `"rk4"`, `"leapfrog"` or `"euler"`. Euler is only for testing purposes, as it is numerically unstable.
 
 ### Postprocessing general variables
-- `postpro.silo_output_rate` (integer): 
+- `postpro.silo_output_rate` (integer):
 - `postpro.cycle_print_rate` (integer):
 
 ### Mesh variables
 - `mesh.scalefactor` (real): apply a scaling factor to the mesh.
 
+### Parallel execution information (available only if MPI support is enabled)
+The parallel solver runs as separate MPI processes. As such, each process loads and executes its own instance of the configuration script. This means you must beware of global variables and other shared state: modifications made in one process are **not** reflected in the others. Although the configuration script should in general not depend on the rank of the current process, the following variables are exposed:
+- `parallel.comm_rank` (integer): The MPI rank of the current solver process.
+- `parallel.comm_size` (integer): The MPI communicator size.
+
 ### Callable functions
 - `enable_boundary_sources()`: takes a boolean parameter that specifies if boundary sources should be enabled or not
 - `enable_interface_sources()`: takes a boolean parameter that specifies if the sources applied on internal interfaces should be enabled or not
@@ -137,6 +142,10 @@ None.
 - `internal.model`: opaque reference to the internal representation of the GMSH model. Valid only after the solver is fully initialized, i.e. inside the callback functions.
 - `internal.state`: opaque reference to the internal representation of the solver state. Valid only after the solver is fully initialized, i.e. inside the callback functions.
 
+### Debug and validation +The solver has some facilities to allow validation and comparison with analytical solutions. Those facilities are accessed via the `debug` table. In the current implementation the `debug` table is undefined by default, so it must be initialized explicitly in the configuration with a statement `debug = {}`. Possible members of the `debug` table are: +- `analytical_solution(tag, x, y, z, t)`: define the analytical solution of the problem. If defined, the numerical solution is compared with the analytical solution at each timestep in the whole domain. The function has to return 6 values, namely `Ex`, `Ey`, `Ez`, `Hx`, `Hy` and `Hz`. + ### Postprocessing The export of the fields E, H and J to Silo is controlled using the appropriate sections in the `postpro` table. - `postpro["E"].silo_mode` controls the export of the electric field E. Possible values are `"none"` (no export), `"nodal"` (export as a Silo nodal variable) and `"zonal"` (export as a Silo zonal variable) diff --git a/include/gmsh_io.h b/include/gmsh_io.h index 431e135..7c530e0 100644 --- a/include/gmsh_io.h +++ b/include/gmsh_io.h @@ -240,6 +240,11 @@ public: { return ipconn; } +#else + size_t num_cells_world() const + { + return num_cells(); + } #endif /* USE_MPI */ diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h index 46ec674..d11acab 100644 --- a/include/maxwell_interface.h +++ b/include/maxwell_interface.h @@ -74,6 +74,9 @@ public: vec3d eval_boundary_source(int, const point_3d&, double) const; vec3d eval_interface_source(int, const point_3d&, double) const; + bool has_analytical_solution(void) const; + std::pair<vec3d, vec3d> eval_analytical_solution(int, const point_3d&, double t) const; + boundary_condition boundary_type(int tag) const; interface_condition interface_type(int tag) const; field_export_mode postpro_fieldExportMode(const std::string&) const; diff --git a/src/gmsh_io.cpp b/src/gmsh_io.cpp index 8753340..0e1bd75 100644 --- a/src/gmsh_io.cpp +++ b/src/gmsh_io.cpp @@ -557,9 +557,10 @@ model::make_comm_descriptors(void) return a.first < b; }; - size_t flux_base = 0; - /* For each entity */ + /* Sorry for this. Basically I need to have the faces I will + * find in the next loop in the exact same order on both sides + * of the IPC boundary. 
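+     * (Both ranks fill their send/receive buffers by indexing with
+     * dof_mapping[i], so entry i must refer to the same face on either
+     * side; see exchange_boundary_data_*() in maxwell_cpu.cpp.)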
*/ struct lazy_hack { element_key fk; size_t fbs; @@ -569,6 +570,8 @@ model::make_comm_descriptors(void) std::vector<lazy_hack> lhs; + size_t flux_base = 0; + /* For each entity */ for (auto& e : entities) { /* and for each face */ diff --git a/src/maxwell_common.cpp b/src/maxwell_common.cpp index c8f83c0..4fc24be 100644 --- a/src/maxwell_common.cpp +++ b/src/maxwell_common.cpp @@ -5,7 +5,7 @@ #define DIRICHLET 2.0 #define ROBIN 1.0 #ifdef USE_MPI -#define INTERPROCESS 1.0 +#define INTERPROCESS 1.0 #endif /* USE_MPI */ #define NEUMANN 0.0 @@ -327,9 +327,9 @@ void export_J_field_nodal(const model& mod, silo& sdb, const vecxd& Ex, void export_E_field_zonal(const model& mod, silo& sdb, const vecxd& Ex, const vecxd& Ey, const vecxd& Ez, const parameter_loader&) { - std::vector<double> out_Ex( mod.num_cells() ); - std::vector<double> out_Ey( mod.num_cells() ); - std::vector<double> out_Ez( mod.num_cells() ); + std::vector<double> out_Ex( mod.num_cells_world() ); + std::vector<double> out_Ey( mod.num_cells_world() ); + std::vector<double> out_Ez( mod.num_cells_world() ); #ifdef USE_MPI for (auto& etag : mod.world_entities_tags()) @@ -362,9 +362,9 @@ void export_E_field_zonal(const model& mod, silo& sdb, const vecxd& Ex, void export_H_field_zonal(const model& mod, silo& sdb, const vecxd& Hx, const vecxd& Hy, const vecxd& Hz, const parameter_loader&) { - std::vector<double> out_Hx( mod.num_cells() ); - std::vector<double> out_Hy( mod.num_cells() ); - std::vector<double> out_Hz( mod.num_cells() ); + std::vector<double> out_Hx( mod.num_cells_world() ); + std::vector<double> out_Hy( mod.num_cells_world() ); + std::vector<double> out_Hz( mod.num_cells_world() ); #ifdef USE_MPI for (auto& etag : mod.world_entities_tags()) @@ -397,9 +397,9 @@ void export_H_field_zonal(const model& mod, silo& sdb, const vecxd& Hx, void export_J_field_zonal(const model& mod, silo& sdb, const vecxd& Ex, const vecxd& Ey, const vecxd& Ez, const parameter_loader& mpl) { - std::vector<double> out_Jx( mod.num_cells() ); - std::vector<double> out_Jy( mod.num_cells() ); - std::vector<double> out_Jz( mod.num_cells() ); + std::vector<double> out_Jx( mod.num_cells_world() ); + std::vector<double> out_Jy( mod.num_cells_world() ); + std::vector<double> out_Jz( mod.num_cells_world() ); #ifdef USE_MPI for (auto& etag : mod.world_entities_tags()) diff --git a/src/maxwell_cpu.cpp b/src/maxwell_cpu.cpp index d0d07a4..4c74a98 100644 --- a/src/maxwell_cpu.cpp +++ b/src/maxwell_cpu.cpp @@ -83,14 +83,15 @@ init_from_model(const model& mod, solver_state& state) state.eds.push_back( std::move(ed) ); } +#ifdef USE_MPI auto& cds = mod.comm_descriptors(); size_t dofs = 0; for (auto& [tag, cd] : cds) - if (cd.dof_mapping.size() > dofs) - dofs = cd.dof_mapping.size(); + dofs = std::max(cd.dof_mapping.size(), dofs); state.ipc.resize(dofs); +#endif /* USE_MPI */ state.curr_time = 0.0; state.curr_timestep = 0; @@ -185,6 +186,7 @@ compute_field_jumps(solver_state& state, const field& in) } } +#ifdef USE_MPI static void exchange_boundary_data(const model& mod, solver_state& state) { @@ -234,6 +236,7 @@ exchange_boundary_data(const model& mod, solver_state& state) } } } +#endif /* USE_MPI */ static void compute_field_jumps_E(solver_state& state, const field& in) @@ -266,6 +269,59 @@ compute_field_jumps_E(solver_state& state, const field& in) } } +#ifdef USE_MPI +static void +exchange_boundary_data_E(const model& mod, solver_state& state) +{ + auto& cds = mod.comm_descriptors(); + for (auto& [tag, cd] : cds) + { + size_t ndofs = cd.dof_mapping.size(); + if 
(cd.rank_mine < cd.rank_other) + { + for (size_t i = 0; i < ndofs; i++) + { + auto srci = cd.dof_mapping[i]; + state.ipc.Ex[i] = state.jumps.Ex[srci]; + state.ipc.Ey[i] = state.jumps.Ey[srci]; + state.ipc.Ez[i] = state.jumps.Ez[srci]; + } + MPI_Send(state.ipc.Ex.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Ey.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Ez.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + + MPI_Recv(state.ipc.Ex.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Ey.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Ez.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + for (size_t i = 0; i < cd.dof_mapping.size(); i++) + { + auto dsti = cd.dof_mapping[i]; + state.jumps.Ex[dsti] -= state.ipc.Ex[i]; + state.jumps.Ey[dsti] -= state.ipc.Ey[i]; + state.jumps.Ez[dsti] -= state.ipc.Ez[i]; + } + } + else + { + MPI_Recv(state.ipc.Ex.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Ey.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Ez.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + for (size_t i = 0; i < cd.dof_mapping.size(); i++) + { + auto dsti = cd.dof_mapping[i]; + double tmp; + tmp = state.jumps.Ex[dsti]; state.jumps.Ex[dsti] -= state.ipc.Ex[i]; state.ipc.Ex[i] = tmp; + tmp = state.jumps.Ey[dsti]; state.jumps.Ey[dsti] -= state.ipc.Ey[i]; state.ipc.Ey[i] = tmp; + tmp = state.jumps.Ez[dsti]; state.jumps.Ez[dsti] -= state.ipc.Ez[i]; state.ipc.Ez[i] = tmp; + } + MPI_Send(state.ipc.Ex.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Ey.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Ez.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + } + } +} +#endif /* USE_MPI */ + static void compute_field_jumps_H(solver_state& state, const field& in) { @@ -297,6 +353,59 @@ compute_field_jumps_H(solver_state& state, const field& in) } } +#ifdef USE_MPI +static void +exchange_boundary_data_H(const model& mod, solver_state& state) +{ + auto& cds = mod.comm_descriptors(); + for (auto& [tag, cd] : cds) + { + size_t ndofs = cd.dof_mapping.size(); + if (cd.rank_mine < cd.rank_other) + { + for (size_t i = 0; i < ndofs; i++) + { + auto srci = cd.dof_mapping[i]; + state.ipc.Hx[i] = state.jumps.Hx[srci]; + state.ipc.Hy[i] = state.jumps.Hy[srci]; + state.ipc.Hz[i] = state.jumps.Hz[srci]; + } + MPI_Send(state.ipc.Hx.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Hy.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Hz.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + + MPI_Recv(state.ipc.Hx.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Hy.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Hz.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + for (size_t i = 0; i < cd.dof_mapping.size(); i++) + { + auto dsti = cd.dof_mapping[i]; + state.jumps.Hx[dsti] -= state.ipc.Hx[i]; + state.jumps.Hy[dsti] -= state.ipc.Hy[i]; + state.jumps.Hz[dsti] -= state.ipc.Hz[i]; + } + } + else + { + MPI_Recv(state.ipc.Hx.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Hy.data(), 
ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + MPI_Recv(state.ipc.Hz.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD, nullptr); + for (size_t i = 0; i < cd.dof_mapping.size(); i++) + { + auto dsti = cd.dof_mapping[i]; + double tmp; + tmp = state.jumps.Hx[dsti]; state.jumps.Hx[dsti] -= state.ipc.Hx[i]; state.ipc.Hx[i] = tmp; + tmp = state.jumps.Hy[dsti]; state.jumps.Hy[dsti] -= state.ipc.Hy[i]; state.ipc.Hy[i] = tmp; + tmp = state.jumps.Hz[dsti]; state.jumps.Hz[dsti] -= state.ipc.Hz[i]; state.ipc.Hz[i] = tmp; + } + MPI_Send(state.ipc.Hx.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Hy.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + MPI_Send(state.ipc.Hz.data(), ndofs, MPI_DOUBLE, cd.rank_other, tag, MPI_COMM_WORLD); + } + } +} +#endif /* USE_MPI */ + static void compute_fluxes_planar(solver_state& state) { @@ -452,9 +561,14 @@ compute_fluxes(const model& mod, solver_state& state, const field& in, field& ou } static void -compute_fluxes_E(solver_state& state, const field& in, field& out) +compute_fluxes_E(const model& mod, solver_state& state, const field& in, field& out) { compute_field_jumps_E(state, in); + +#ifdef USE_MPI + exchange_boundary_data_E(mod, state); +#endif /* USE_MPI */ + compute_fluxes_planar_E(state); for (auto& ed : state.eds) @@ -466,9 +580,14 @@ compute_fluxes_E(solver_state& state, const field& in, field& out) } static void -compute_fluxes_H(solver_state& state, const field& in, field& out) +compute_fluxes_H(const model& mod, solver_state& state, const field& in, field& out) { compute_field_jumps_H(state, in); + +#ifdef USE_MPI + exchange_boundary_data_H(mod, state); +#endif /* USE_MPI */ + compute_fluxes_planar_H(state); for (auto& ed : state.eds) @@ -529,11 +648,11 @@ apply_operator(const model& mod, solver_state& state, const field& curr, field& } static void -leapfrog(solver_state& state) +leapfrog(const model& mod, solver_state& state) { auto dt = state.delta_t; compute_curls_H(state, state.emf_curr, state.tmp); - compute_fluxes_H(state, state.emf_curr, state.tmp); + compute_fluxes_H(mod, state, state.emf_curr, state.tmp); for (size_t i = 0; i < state.emf_next.num_dofs; i++) { auto CRM = 1.0 - 0.5*dt*state.matparams.sigma_over_epsilon[i]; @@ -546,7 +665,7 @@ leapfrog(solver_state& state) } compute_curls_E(state, state.emf_next, state.tmp); - compute_fluxes_E(state, state.emf_next, state.tmp); + compute_fluxes_E(mod, state, state.emf_next, state.tmp); for (size_t i = 0; i < state.emf_next.num_dofs; i++) { auto CC = dt*state.matparams.inv_mu[i]; @@ -583,7 +702,7 @@ timestep(const model& mod, solver_state& state, const parameter_loader&, time_in if (ti == time_integrator_type::LEAPFROG) { - leapfrog(state); + leapfrog(mod, state); } state.curr_time += state.delta_t; @@ -751,6 +870,20 @@ export_fields_to_silo(const model& mod, maxwell::solver_state& state, export_J_field_zonal(mod, sdb, f.Ex, f.Ey, f.Ez, mpl); break; } + + std::vector<double> entranks( mod.num_cells_world(), 999999999 ); + for (auto& etag : mod.world_entities_tags()) + { + auto& e = mod.entity_world_lookup(etag); + int er = mod.entity_rank(etag); + + for (size_t iT = 0; iT < e.num_cells(); iT++) + { + auto gi = e.cell_world_index_by_gmsh(iT); + entranks[gi] = er; + } + } + sdb.write_zonal_variable("entity_ranks", entranks); } else { diff --git a/src/maxwell_interface.cpp b/src/maxwell_interface.cpp index 9747e74..9bbaa66 100644 --- a/src/maxwell_interface.cpp +++ b/src/maxwell_interface.cpp @@ -688,4 +688,22 @@ 
parameter_loader::postpro_fieldExportMode(const std::string& field) const return field_export_mode::NONE; } +bool parameter_loader::has_analytical_solution(void) const +{ + auto asol_fun = lua["debug"]["analytical_solution"]; + if (asol_fun.valid()) + return true; + + return false; +} + +std::pair<vec3d, vec3d> +parameter_loader::eval_analytical_solution(int tag, const point_3d& pt, double t) const +{ + vec3d E, H; + sol::tie(E(0), E(1), E(2), H(0), H(1), H(2)) = + lua["debug"]["analytical_solution"](tag, pt.x(), pt.y(), pt.z(), t); + return std::make_pair(E,H); +} + } // namespace maxwell diff --git a/src/maxwell_solver.cpp b/src/maxwell_solver.cpp index 27fab66..6cd728b 100644 --- a/src/maxwell_solver.cpp +++ b/src/maxwell_solver.cpp @@ -193,9 +193,114 @@ reinit_materials(const model& mod, maxwell::solver_state& state, maxwell::parame } #endif /* ENABLE_EXP_GEOMETRIC_DIODE */ +struct maxwell_6ple { + double Ex; + double Ey; + double Ez; + double Hx; + double Hy; + double Hz; + + maxwell_6ple() + : Ex(0.0), Ey(0.0), Ez(0.0), + Hx(0.0), Hy(0.0), Hz(0.0) + {} +}; + +std::ostream& +operator<<(std::ostream& os, const maxwell_6ple& m) +{ + os << m.Ex << " " << m.Ey << " " << m.Ez << " "; + os << m.Hx << " " << m.Hy << " " << m.Hz; + return os; +} + +maxwell_6ple sqrt(const maxwell_6ple& m) +{ + maxwell_6ple ret; + ret.Ex = std::sqrt(m.Ex); + ret.Ey = std::sqrt(m.Ey); + ret.Ez = std::sqrt(m.Ez); + ret.Hx = std::sqrt(m.Hx); + ret.Hy = std::sqrt(m.Hy); + ret.Hz = std::sqrt(m.Hz); + return ret; +} + +static void +validate(const model& mod, maxwell::solver_state& state, maxwell::parameter_loader& mpl, std::ofstream& ofs) +{ + maxwell_6ple err; + maxwell_6ple energy; + + for (auto& e : mod) + { + for (size_t iT = 0; iT < e.num_cells(); iT++) + { + const auto& pe = e.cell(iT); + const auto& re = e.cell_refelem(pe); + + const auto pqps = pe.integration_points(); + const auto dets = pe.determinants(); + + const auto ofs = e.cell_world_dof_offset(iT); + const auto cbs = re.num_basis_functions(); + + assert(pqps.size() == dets.size()); + const auto bar = pe.barycenter(); + auto eps = mpl.epsilon( e.material_tag(), bar ); + auto mu = mpl.mu( e.material_tag(), bar ); + + for (size_t iQp = 0; iQp < pqps.size(); iQp++) + { + const auto& pqp = pqps[iQp]; + const vecxd phi = re.basis_functions(iQp); + auto [E, H] = mpl.eval_analytical_solution(e.material_tag(), pqp.point(), state.curr_time); + + maxwell_6ple Fh; + Fh.Ex = state.emf_next.Ex.segment(ofs, cbs).dot(phi); + Fh.Ey = state.emf_next.Ey.segment(ofs, cbs).dot(phi); + Fh.Ez = state.emf_next.Ez.segment(ofs, cbs).dot(phi); + Fh.Hx = state.emf_next.Hx.segment(ofs, cbs).dot(phi); + Fh.Hy = state.emf_next.Hy.segment(ofs, cbs).dot(phi); + Fh.Hz = state.emf_next.Hz.segment(ofs, cbs).dot(phi); + + double ae; + ae = E(0) - Fh.Ex; err.Ex += pqp.weight() * ae * ae; + ae = E(1) - Fh.Ey; err.Ey += pqp.weight() * ae * ae; + ae = E(2) - Fh.Ez; err.Ez += pqp.weight() * ae * ae; + ae = H(0) - Fh.Hx; err.Hx += pqp.weight() * ae * ae; + ae = H(1) - Fh.Hy; err.Hy += pqp.weight() * ae * ae; + ae = H(2) - Fh.Hz; err.Hz += pqp.weight() * ae * ae; + + energy.Ex += 0.5*eps*(Fh.Ex*Fh.Ex)*pqp.weight(); + energy.Ey += 0.5*eps*(Fh.Ey*Fh.Ey)*pqp.weight(); + energy.Ez += 0.5*eps*(Fh.Ez*Fh.Ez)*pqp.weight(); + energy.Hx += 0.5*mu*(Fh.Hx*Fh.Hx)*pqp.weight(); + energy.Hy += 0.5*mu*(Fh.Hy*Fh.Hy)*pqp.weight(); + energy.Hz += 0.5*mu*(Fh.Hz*Fh.Hz)*pqp.weight(); + } + } + } + + ofs << energy.Ex << " " << energy.Ey << " " << energy.Ez << " " << energy.Hx; + ofs << " " << energy.Hy << " " << 
energy.Hz << " "; + ofs << energy.Ex + energy.Ey + energy.Ez + energy.Hx + energy.Hy + energy.Hz;// << std::endl; + ofs << " " << sqrt(err) << std::endl; +} + +//static void +//validate(const model& mod, maxwell::solver_state_gpu& state, maxwell::parameter_loader& mpl) +//{} + template<typename State> -void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl) +void solver_mainloop(const model& mod, State& state, maxwell::parameter_loader& mpl) { +#ifdef USE_MPI + int proc_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank); +#endif /* USE_MPI */ + initialize_solver(mod, state, mpl); state.delta_t = mpl.sim_dt(); @@ -204,10 +309,21 @@ void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl) auto cycle_print_rate = mpl.postpro_cyclePrintRate(); auto ti = mpl.sim_timeIntegrator(); +#ifdef USE_MPI + if (proc_rank == 0) { +#endif /* USE_MPI */ std::cout << " BEGINNING SIMULATION" << std::endl; std::cout << "I will do " << num_timesteps << " of " << state.delta_t; std::cout << "s each." << std::endl; std::cout << "Time integrator: " << mpl.sim_timeIntegratorName() << std::endl; +#ifdef USE_MPI + } +#endif /* USE_MPI */ + + std::ofstream ofs; + bool validate_solution = mpl.has_analytical_solution(); + if (validate_solution) + ofs.open("energy.txt"); mpl.call_initialization_callback(); #ifdef ENABLE_EXP_GEOMETRIC_DIODE @@ -228,13 +344,27 @@ void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl) if ( (silo_output_rate != 0) and (i%silo_output_rate == 0) ) export_fields_to_silo(mod, state, mpl); + if ( validate_solution ) + validate(mod, state, mpl, ofs); + swap(state, mpl); if ( (cycle_print_rate != 0) and (i%cycle_print_rate == 0) ) { double time = tc.toc(); + double dofs_s_proc = (mod.num_dofs()*6*cycle_print_rate)/(time); +#ifdef USE_MPI + double dofs_s_total; + MPI_Reduce(&dofs_s_proc, &dofs_s_total, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (proc_rank == 0) + { + std::cout << "Cycle " << i << ": t = " << state.curr_time << " s"; + std::cout << ", DOFs/s: " << dofs_s_total << std::endl; + } +#else /* USE_MPI */ std::cout << "Cycle " << i << ": t = " << state.curr_time << " s"; - std::cout << ", DOFs/s: " << (mod.num_dofs()*6*cycle_print_rate)/(time) << std::endl; + std::cout << ", DOFs/s: " << dofs_s_proc << std::endl; +#endif /* USE_MPI */ tc.tic(); } } @@ -306,12 +436,18 @@ int main(int argc, char *argv[]) #ifdef DONT_USE_STD_FILESYSTEM mkdir(mpl.sim_name().c_str(), 0755); #else - std::filesystem::create_directory( mpl.sim_name() );; + std::filesystem::create_directory( mpl.sim_name() ); #endif +#ifdef USE_MPI + if (comm_rank == 0) { +#endif /* USE_MPI */ gmsh::initialize(); gmsh::option::setNumber("General.Terminal", 0); gmsh::open( mpl.sim_gmshmodel() ); +#ifdef USE_MPI + } +#endif /* USE_MPI */ model mod( mpl.sim_geomorder(), mpl.sim_approxorder() ); @@ -349,7 +485,7 @@ int main(int argc, char *argv[]) #ifdef ENABLE_GPU_SOLVER maxwell::solver_state_gpu state_g; register_lua_usertypes_bystate(mpl, mod, state_g); - test_it(mod, state_g, mpl); + solver_mainloop(mod, state_g, mpl); #else std::cout << "GPU solver not compiled. Exiting." 
<< std::endl; #endif /* ENABLE_GPU_SOLVER */ @@ -358,7 +494,7 @@ int main(int argc, char *argv[]) { maxwell::solver_state state_c; register_lua_usertypes_bystate(mpl, mod, state_c); - test_it(mod, state_c, mpl); + solver_mainloop(mod, state_c, mpl); } //gmsh::finalize(); diff --git a/src/param_loader.cpp b/src/param_loader.cpp index eb9b971..deea766 100644 --- a/src/param_loader.cpp +++ b/src/param_loader.cpp @@ -1,4 +1,9 @@ #include <iostream> + +#ifdef USE_MPI +#include <mpi.h> +#endif /* USE_MPI */ + #include "param_loader.h" parameter_loader_base::parameter_loader_base() @@ -44,6 +49,15 @@ parameter_loader_base::init(void) lua.set_function("enable_volume_sources", ¶meter_loader_base::enable_volume_sources, this); + +#ifdef USE_MPI + int comm_rank, comm_size; + MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank); + MPI_Comm_size(MPI_COMM_WORLD, &comm_size); + lua["parallel"] = lua.create_table(); + lua["parallel"]["comm_rank"] = comm_rank; + lua["parallel"]["comm_size"] = comm_size; +#endif /* USE_MPI */ } bool -- GitLab
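For reference, the Lua API additions documented in `doc/lua_api.md` above can be exercised with a configuration fragment along the following lines. This is a minimal sketch: the analytical fields are placeholders chosen only to illustrate the required return values, and the rest of the configuration (mesh, materials, sources, timestep) is omitted.

```lua
-- Configuration fragment exercising the `parallel` and `debug` tables
-- documented above; combine with the usual sim/postpro settings.

-- Each MPI process runs its own copy of this script. The `parallel`
-- table exists only when the solver is built with MPI support, so
-- guard the access.
if parallel and parallel.comm_rank == 0 then
    print("running on " .. parallel.comm_size .. " MPI processes")
end

-- The debug table is undefined by default and must be created explicitly.
debug = {}

-- Reference solution used for the per-timestep comparison. It must return
-- the six values Ex, Ey, Ez, Hx, Hy, Hz; the fields below are placeholders,
-- not a physically meaningful solution.
debug.analytical_solution = function(tag, x, y, z, t)
    local Ey = math.sin(math.pi * x) * math.cos(math.pi * t)
    local Hz = math.sin(math.pi * x) * math.sin(math.pi * t)
    return 0, Ey, 0, 0, 0, Hz
end
```

When `debug.analytical_solution` is defined, the CPU solver opens `energy.txt` and appends, at every timestep, the six field energies, their sum, and the per-component error norms computed by `validate()`.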