/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin
* solver for Conservation Laws.
*
* Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège
*
* This code is released under GNU AGPLv3 license, see LICENSE.txt for details.
*/
#include <iostream>
#include <vector>
#include <regex>
#include <cassert>
#include <cstdio>

#ifdef DONT_USE_STD_FILESYSTEM
#include <sys/stat.h>
#include <sys/types.h>
#else
#include <filesystem>
#endif /* DONT_USE_STD_FILESYSTEM */

#include <sys/time.h>
#include <sys/resource.h>

#ifdef USE_MPI
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Weverything"
#include <mpi.h>
#pragma clang diagnostic pop
#endif /* USE_MPI */
#include "libgmshdg/silo_output.hpp"
#include "libgmshdg/param_loader.h"
#include "maxwell/maxwell_interface.h"
#include "maxwell/maxwell_common.h"
#include "sgr.hpp"
using namespace sgr;

/* Shorthand for the GMSH mesh API (assumed alias; gmm::generate() and
 * gmm::affineTransform() are used below). */
namespace gmm = gmsh::model::mesh;
template<typename State>
void initialize_solver(const model& mod, State& state, const maxwell::parameter_loader& mpl)
{
maxwell::init_from_model(mod, state, mpl);
if ( mpl.initial_Efield_defined() )
{
auto E = [&](const point_3d& pt) -> vec3d { return mpl.initial_Efield(pt); };
maxwell::init_E_field(mod, state, E);
}
if ( mpl.initial_Hfield_defined() )
{
auto H = [&](const point_3d& pt) -> vec3d { return mpl.initial_Hfield(pt); };
maxwell::init_H_field(mod, state, H);
}
}
#ifdef ENABLE_PROFILING
#ifdef ENABLE_GPU_SOLVER
#define TYPE profiler::type::GPU
#else
#define TYPE profiler::type::CPU
#endif /* ENABLE_GPU_SOLVER */
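/* Register per-kernel profilers with estimated flop counts and memory traffic,
 * so the logger can report GFlops and GB figures for every stage of the DG
 * update (curls, field jumps, fluxes, lifting). */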
template<typename State>
void initialize_profiling(const model& mod, State& state, time_integrator_type ti)
{
auto num_entities = mod.num_entities();
auto num_cells = mod.num_cells_world();
auto num_all_faces = mod.num_faces();
auto cell_dofs = num_dofs_3D(mod.approximation_order);
auto cell_fluxes = num_dofs_2D(mod.approximation_order);
auto num_all_dofs = mod.num_dofs();
auto num_all_fluxes = mod.num_fluxes();
auto num_all_orientations = mod.num_orientations();
std::cout << "num_entities: " << num_entities << std::endl
<< "num_cells: " << num_cells << std::endl
<< "num_all_faces: " << num_all_faces << std::endl
<< "cell_dofs: " << cell_dofs << std::endl
<< "cell_fluxes: " << cell_fluxes << std::endl
<< "num_all_dofs: " << num_all_dofs << std::endl
<< "num_all_fluxes: " << num_all_fluxes << std::endl;
state.logger.register_profiler("solver", TYPE)
auto jacobians_read = 3*3*num_cells;
auto dm_read = 3*cell_dofs*cell_dofs;
auto curl_readwrite = 6*num_all_dofs;
auto sources_application = 3*num_all_dofs;
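/* The leapfrog integrator updates H and E in separate substeps, so the curl
 * kernels are profiled per field; the other integrators advance both fields
 * together and use a single set of profilers. */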
if (ti == time_integrator_type::LEAPFROG)
{
#ifdef TM_NEW_CURLS
state.logger.register_profiler("compute_curls_H", TYPE)
.set_quantity("GFlops", ((45*cell_dofs+9)*num_all_dofs)/1e9)
.set_quantity("GB", (jacobians_read+dm_read+curl_readwrite+sources_application)*8/1e9);
state.logger.register_profiler("compute_curls_E", TYPE)
.set_quantity("GFlops", ((45*cell_dofs+6)*num_all_dofs)/1e9)
.set_quantity("GB", (jacobians_read+dm_read+curl_readwrite)*8/1e9);
#else /* TM_NEW_CURLS */
auto derivatives = jacobians_read+dm_read+4*num_all_dofs;
auto curls_h = 4*num_all_dofs;
auto curls_e = 3*num_all_dofs;
state.logger.register_profiler("compute_curls_H", TYPE)
.set_quantity("GFlops", ((21*cell_dofs*num_all_dofs)*3 + 6*num_all_dofs)/1e9)
.set_quantity("GB", (derivatives*3+curls_h)*8/1e9);
state.logger.register_profiler("compute_curls_E", TYPE)
.set_quantity("GFlops", ((21*cell_dofs*num_all_dofs)*3 + 3*num_all_dofs)/1e9)
.set_quantity("GB", (derivatives*3+curls_e)*8/1e9);
#endif /* TM_NEW_CURLS */
state.logger.register_profiler("compute_field_jumps", TYPE)
.set_quantity("GFlops", 3*num_all_fluxes/1e9)
.set_quantity("GB", 9*num_all_fluxes*8/1e9);
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto sources_imposition = 18*num_all_fluxes;
auto matparams_readwrite = 4*num_all_fluxes;
auto fluxes_write = 3*num_all_fluxes;
state.logger.register_profiler("compute_fluxes_planar", TYPE)
.set_quantity("GFlops", 37*num_all_fluxes/1e9)
.set_quantity("GB", (normals_read+dets_read+sources_imposition+matparams_readwrite+fluxes_write)*8/1e9);
dets_read = num_cells;
auto lm_read = num_all_orientations*cell_dofs*4*cell_fluxes;
auto lifting_readwrite = num_all_fluxes+num_all_dofs;
state.logger.register_profiler("compute_flux_lifting", TYPE)
.set_quantity("GFlops", 21*cell_dofs*num_all_dofs*3/1e9)
.set_quantity("GB", 3*(dets_read+lm_read+lifting_readwrite)*8/1e9);
}
else
{
#ifdef TM_NEW_CURLS
state.logger.register_profiler("compute_curls", TYPE)
.set_quantity("GFlops", ((45*cell_dofs+6)*num_all_dofs + (45*cell_dofs+9)*num_all_dofs)/1e9)
.set_quantity("GB", (2*(jacobians_read+dm_read+curl_readwrite)+sources_application)*8/1e9);
#else /* TM_NEW_CURLS */
auto derivatives = jacobians_read+dm_read+4*num_all_dofs;
auto curls = 4*num_all_dofs + 3*num_all_dofs;
state.logger.register_profiler("compute_curls", TYPE)
.set_quantity("GFlops", ((21*cell_dofs*num_all_dofs)*6 + 9*num_all_dofs)/1e9)
.set_quantity("GB", (derivatives*6+curls)*8/1e9);
#endif /* TM_NEW_CURLS */
#ifdef USE_MPI
state.logger.register_profiler("compute_field_jumps", TYPE)
.set_quantity("GFlops", 6*num_all_fluxes/1e9)
.set_quantity("GB", 18*num_all_fluxes*8/1e9);
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto sources_imposition = 18*num_all_fluxes;
auto matparams_readwrite = 8*num_all_fluxes;
auto fluxes_write = 6*num_all_fluxes;
state.logger.register_profiler("compute_fluxes_planar", TYPE)
.set_quantity("GFlops", 68*num_all_fluxes/1e9)
.set_quantity("GB", (normals_read+dets_read+sources_imposition+matparams_readwrite+fluxes_write)*8/1e9);
#else /* USE_MPI */
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto jumps_and_sources = 24*num_all_fluxes;
auto matparams_readwrite = 8*num_all_fluxes;
auto fluxes_write = 6*num_all_fluxes;
state.logger.register_profiler("compute_field_jumps_and_fluxes", TYPE)
.set_quantity("GFlops", 72*num_all_fluxes/1e9)
.set_quantity("GB", (normals_read+dets_read+jumps_and_sources+matparams_readwrite+fluxes_write)*8/1e9);
dets_read = num_cells;
auto lm_read = num_all_orientations*cell_dofs*4*cell_fluxes;
auto lifting_readwrite = num_all_fluxes+num_all_dofs;
state.logger.register_profiler("compute_flux_lifting", TYPE)
.set_quantity("GFlops", 21*cell_dofs*num_all_dofs*6/1e9)
.set_quantity("GB", 6*(dets_read+lm_read+lifting_readwrite)*8/1e9);
}
}
#endif /* ENABLE_PROFILING */
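/* A straight integration line between two points: the segment is sampled at
 * 'samples' midpoints, and each sampling point is located in the mesh with
 * gmsh::model::mesh::getElementByCoordinates(), which also returns its
 * reference coordinates inside the containing element. */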
struct integration_line
{
std::vector<size_t> element_tags;
std::vector<point_3d> sampling_points_phys;
std::vector<point_3d> sampling_points_ref;
point_3d sampling_increment;
integration_line(const point_3d& start,
const point_3d& end, size_t samples)
{
sampling_increment = (end - start)/samples;
for (size_t i = 0; i < samples; i++)
{
point_3d sp = start + sampling_increment*(i+0.5);
sampling_points_phys.push_back(sp);
size_t etag;
int etype;
std::vector<size_t> ntags;
double u, v, w;
gmsh::model::mesh::getElementByCoordinates(sp.x(), sp.y(), sp.z(),
etag, etype, ntags, u, v, w, 3, true);
element_tags.push_back(etag);
sampling_points_ref.push_back( point_3d(u,v,w) );
}
}
size_t sampling_points() const
{
assert(element_tags.size() == sampling_points_phys.size());
assert(element_tags.size() == sampling_points_ref.size());
return element_tags.size();
}
std::tuple<size_t, point_3d, point_3d>
operator[](size_t pos)
{
assert(element_tags.size() == sampling_points_phys.size());
assert(element_tags.size() == sampling_points_ref.size());
assert(pos < element_tags.size());
return std::make_tuple(
element_tags[pos],
sampling_points_phys[pos],
sampling_points_ref[pos]
);
}
point_3d increment() const
{
return sampling_increment;
}
};
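/* Approximate the line integral of the electric field along an
 * integration_line with the midpoint rule: at each sampling point the local
 * field is reconstructed from the DG basis and dotted with the sampling
 * increment. */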
double
integrate_electric_field(const model& mod, const maxwell::solver_state& state,
integration_line& iline)
{
double integral_val = 0.0;
auto si = iline.increment();
for (size_t i = 0; i < iline.sampling_points(); i++)
{
auto [tag, physp, refp] = iline[i];
auto [entnum, ofs] = mod.lookup_tag(tag);
auto& e = mod.entity_at(entnum);
auto& pe = e.cell(ofs);
auto& re = e.cell_refelem(pe);
auto dof_ofs = e.cell_model_dof_offset(ofs);
auto dof_num = re.num_basis_functions();
vecxd phi = re.basis_functions( refp );
vecxd locEx = state.emf_curr.Ex.segment(dof_ofs, dof_num);
vecxd locEy = state.emf_curr.Ey.segment(dof_ofs, dof_num);
vecxd locEz = state.emf_curr.Ez.segment(dof_ofs, dof_num);
integral_val += locEx.dot(phi) * si.x() +
locEy.dot(phi) * si.y() +
locEz.dot(phi) * si.z();
}
return integral_val;
}

#ifdef ENABLE_GPU_SOLVER
double
integrate(const model& mod, const maxwell::solver_state_gpu& state, integration_line& lint)
{
return 0.0;
}
#endif /* ENABLE_GPU_SOLVER */
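/* Re-evaluate the conductivity-dependent material parameters at the cell
 * barycenters. This is exposed to Lua (see the 'relmat' callback below) so the
 * configuration script can switch materials while the simulation runs. */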
void
reinit_materials(const model& mod, maxwell::solver_state& state, maxwell::parameter_loader& mpl)
{
for (auto& e : mod)
{
auto tag = e.gmsh_tag();
for (size_t iT = 0; iT < e.num_cells(); iT++)
{
auto& pe = e.cell(iT);
auto& re = e.cell_refelem(pe);
auto bar = pe.barycenter();
auto epsilon = mpl.epsilon(tag, bar);
//auto mu = mpl.mu(tag, bar);
auto sigma = mpl.sigma(tag, bar);
auto ofs = e.cell_model_dof_offset(iT);
for (size_t iD = 0; iD < re.num_basis_functions(); iD++)
{
//state.matparams.inv_epsilon(ofs+iD) = 1./epsilon;
//state.matparams.inv_mu(ofs+iD) = 1./mu;
state.matparams.sigma(ofs+iD) = sigma;
state.matparams.sigma_over_epsilon(ofs+iD) = sigma/epsilon;
}
}
}
}
/****************************************************************************/
/* Solver driver
*/
template<typename State>
void solver_mainloop(const model& mod, State& state, maxwell::parameter_loader& mpl)
{
#ifdef USE_MPI
int proc_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
#endif /* USE_MPI */
initialize_solver(mod, state, mpl);
state.delta_t = mpl.sim_dt();
auto num_timesteps = mpl.sim_timesteps();
auto silo_output_rate = mpl.postpro_siloOutputRate();
auto cycle_print_rate = mpl.postpro_cyclePrintRate();
auto ti = mpl.sim_timeIntegrator();
#ifdef ENABLE_PROFILING
initialize_profiling(mod, state, ti);
#endif /* ENABLE_PROFILING */
#ifdef USE_MPI
if (proc_rank == 0) {
#endif /* USE_MPI */
std::cout << " BEGINNING SIMULATION" << std::endl;
std::cout << "I will do " << num_timesteps << " timesteps of " << state.delta_t;
std::cout << "s each." << std::endl;
std::cout << "Time integrator: " << mpl.sim_timeIntegratorName() << std::endl;
#ifdef USE_MPI
}
#endif /* USE_MPI */
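/* Report the total number of unknowns: 6 field components per scalar DoF,
 * summed over all ranks when running under MPI. */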
#ifdef USE_MPI
size_t my_num_dofs = 6*mod.num_dofs();
size_t num_dofs;
MPI_Reduce(&my_num_dofs, &num_dofs, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
if (proc_rank == 0)
std::cout << "Model has " << num_dofs << " DoFs" << std::endl;
#else /* USE_MPI */
size_t num_dofs = 6*mod.num_dofs();
std::cout << "Model has " << num_dofs << " DoFs" << std::endl;
#endif /* USE_MPI */
#ifdef USE_MPI
#error "The geometric diode code is not compatible with MPI yet."
#endif
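/* Expose material re-initialization to the Lua configuration script. */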
auto relmat = [&](){ reinit_materials(mod, state, mpl); };
sol::state& lua = mpl.lua_state();
lua["relmat"] = relmat;
double total_iteration_time = 0.;
#ifdef ENABLE_PROFILING
auto& solver_prof = state.logger.get_profiler("solver");
#endif /* ENABLE_PROFILING */
for (size_t i = 0; i < num_timesteps; i++)
{
timecounter tc; /* tic()/toc() timer, assumed from the project's timing utilities */
tc.tic();
/* The actual timestep (field update and source application) goes here;
 * the call is omitted in this listing. */
double time = tc.toc();
total_iteration_time += time;
if ( (silo_output_rate != 0) and (i%silo_output_rate == 0) )
export_fields_to_silo(mod, state, mpl, "");
if ( (cycle_print_rate != 0) and (i%cycle_print_rate == 0) )
{
double dofs_s_proc = (mod.num_dofs()*6)/time;
#ifdef USE_MPI
double dofs_s_total;
MPI_Reduce(&dofs_s_proc, &dofs_s_total, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (proc_rank == 0)
{
std::cout << cr << clrline << "Cycle " << i << ": t = ";
std::cout << state.curr_time << " s" << ", DOFs/s: ";
std::cout << dofs_s_total << std::flush;
}
#else /* USE_MPI */
std::cout << cr << clrline << "Cycle " << i << ": t = ";
std::cout << state.curr_time << " s" << ", DOFs/s: ";
std::cout << dofs_s_proc << ", time: " << time << " s\n";
#endif /* USE_MPI */
#ifdef ENABLE_PROFILING
state.logger.print_current();
#endif /* ENABLE_PROFILING */
}
}
std::cout << std::endl;
mpl.call_finalization_callback();
#ifdef USE_MPI
if (proc_rank == 0)
#endif
{
std::cout << Bon << greenfg;
std::cout << "Total iteration time = " << total_iteration_time << " s" << std::endl;
std::cout << "Avg iteration time = " << total_iteration_time/num_timesteps << " s" << std::endl;
#ifdef USE_MPI
std::cout << "Avg dofs/s = " << num_timesteps*(mod.num_dofs_world()*6)/total_iteration_time << reset << std::endl;
#else
std::cout << "Avg dofs/s = " << num_timesteps*(mod.num_dofs()*6)/total_iteration_time << reset << std::endl;
#ifdef ENABLE_PROFILING
state.logger.print_average();
#endif /* ENABLE_PROFILING */
#endif
}
}
/****************************************************************************/
/* Register Lua usertypes valid for both CPU and GPU
*/
static void
register_lua_usertypes(maxwell::parameter_loader& mpl)
{
sol::state& lua = mpl.lua_state();
sol::usertype<field_values> field_values_ut =
lua.new_usertype<field_values>("field_values",
sol::constructors<field_values()>()
);
field_values_ut["Ex"] = &field_values::Ex;
field_values_ut["Ey"] = &field_values::Ey;
field_values_ut["Ez"] = &field_values::Ez;
field_values_ut["Hx"] = &field_values::Hx;
field_values_ut["Hy"] = &field_values::Hy;
field_values_ut["Hz"] = &field_values::Hz;
sol::usertype<point_3d> point_3d_ut = lua.new_usertype<point_3d>("point",
sol::constructors<point_3d(), point_3d(double, double, double)>()
);
sol::usertype<integration_line> line_integrator_ut =
lua.new_usertype<integration_line>("integration_line",
sol::constructors<integration_line(const point_3d&, const point_3d&, size_t)>()
);
}
/****************************************************************************/
/* Register GMSH API stuff
*/
static void
register_lua_callbacks(maxwell::parameter_loader& mpl, model& mod)
{
sol::state& lua = mpl.lua_state();
auto volume_physical_group_entities = [&](int tag) -> auto {
return sol::as_table( mod.lookup_physical_group_entities(3, tag) );
};
lua["volume_physical_group_entities"] = volume_physical_group_entities;
auto surface_physical_group_entities = [&](int tag) -> auto {
return sol::as_table( mod.lookup_physical_group_entities(2, tag) );
};
lua["surface_physical_group_entities"] = surface_physical_group_entities;
}
/****************************************************************************/
/* Register Lua usertypes valid only for CPU
*/
static void
register_lua_usertypes_bystate(maxwell::parameter_loader& mpl,
model& mod, maxwell::solver_state& state)
{
sol::state& lua = mpl.lua_state();
lua["internal"] = lua.create_table();
lua["internal"]["model"] = &mod;
lua["internal"]["state"] = &state;
auto compute_energy_lambda = [&]() {
return compute_energy(mod, state, mpl);
};
lua["compute_energy"] = compute_energy_lambda;
auto compute_error_lambda = [&]() {
return compute_error(mod, state, mpl);
};
lua["compute_error"] = compute_error_lambda;
auto compare_at_gauss_points_lambda = [&]() {
compare_at_gauss_points(mod, state, mpl);
};
lua["compare_at_gauss_points"] = compare_at_gauss_points_lambda;
lua["integrate_E"] = integrate_electric_field;
#ifdef ENABLE_GPU_SOLVER
/****************************************************************************/
/* Register Lua usertypes valid only for GPU
*/
static void
register_lua_usertypes_bystate(maxwell::parameter_loader& mpl,
model& mod, maxwell::solver_state_gpu& state)
{}
#endif /* ENABLE_GPU_SOLVER */
#ifdef ENABLE_GPU_SOLVER
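/* Bind each MPI rank to one GPU: ranks are grouped by shared-memory node via
 * MPI_Comm_split_type(), and the node-local rank is used as the device index. */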
static void
select_gpu(void)
{
int comm_rank = -1, comm_size = -1;
MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
MPI_Comm local_comm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, comm_rank, MPI_INFO_NULL, &local_comm);
int local_comm_rank = -1, local_comm_size = -1;
MPI_Comm_rank(local_comm, &local_comm_rank);
MPI_Comm_size(local_comm, &local_comm_size);
printf("CR: %d, CS: %d, LCR: %d, LCS: %d\n", comm_rank, comm_size, local_comm_rank, local_comm_size);
int num_devices = 0;
gpuGetDeviceCount(&num_devices);
if (num_devices < local_comm_size)
{
printf("Insufficient GPUs (%d detected, %d required)\n", num_devices, local_comm_size);
MPI_Abort(MPI_COMM_WORLD, -1);
}
gpuSetDevice(local_comm_rank);
}
#endif /* ENABLE_GPU_SOLVER */
/****************************************************************************/
/* Main
*/
int main(int argc, char *argv[])
{
struct rusage ru_start, ru_end;
getrusage(RUSAGE_SELF, &ru_start);
/* Default to a single-process run; MPI overwrites these below. */
int comm_rank = 0, comm_size = 1;
#ifdef USE_MPI
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
#endif /* USE_MPI */
if (argc != 2)
{
std::cout << "Please specify configuration file" << std::endl;
return 1;
}
maxwell::parameter_loader mpl;
if ( not mpl.load_file( argv[1] ) )
{
std::cout << "Configuration problem, exiting" << std::endl;
return 1;
}
#ifdef DONT_USE_STD_FILESYSTEM
mkdir(mpl.sim_name().c_str(), 0755);
#else
std::filesystem::create_directory( mpl.sim_name() );
#endif /* DONT_USE_STD_FILESYSTEM */
#ifdef USE_MPI
if (comm_rank == 0) {
#endif /* USE_MPI */
gmsh::initialize();
gmsh::option::setNumber("General.Terminal", 0);
gmsh::open( mpl.sim_gmshmodel() );
bool generate = std::regex_match(mpl.sim_gmshmodel(), std::regex(".*\\.geo$") );
if (generate)
{
std::cout << "Generating mesh..." << std::flush;
gmm::generate( DIMENSION(3) );
std::cout << "done" << std::endl;
}
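/* Optionally rescale the mesh: the 3x4 row-major affine matrix below applies
 * a uniform scaling by sf to all node coordinates. */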
auto [scaled, sf] = mpl.mesh_scalefactor();
if (scaled)
{
std::vector<double> tr = {sf, 0, 0, 0,
0, sf, 0, 0,
0, 0, sf, 0 };
gmm::affineTransform(tr);
}
#ifdef USE_MPI
}
#endif /* USE_MPI */
model mod( mpl.sim_geomorder(), mpl.sim_approxorder() );
if (comm_size > 1)
{
mod.partition_mesh(comm_size);
std::cout << "Distributing mesh partitions..." << std::flush;
std::cout << "done" << std::endl;
}
else
{
mod.populate_from_gmsh();
}
sol::state& lua = mpl.lua_state();
auto on_mesh_loaded = lua["on_mesh_loaded"];
if (on_mesh_loaded.valid())
on_mesh_loaded();
if ( not mpl.validate_model_params(mod) )
{
std::cout << "Configuration problem, exiting" << std::endl;
return 1;
}
if ( mpl.sim_usegpu() )
{
#ifdef ENABLE_GPU_SOLVER
#ifdef USE_MPI
if (comm_rank == 0)
{
std::cout << " *** MPI not supported yet on GPU. ***" << std::endl;
std::cout << "Running on " << greenfg << "GPU " << nofg << "with ";
std::cout << greenfg << comm_size << nofg << " MPI processes";
std::cout << std::endl;
}
select_gpu();
#else /* USE_MPI */
std::cout << "Running on " << greenfg << "GPU" << nofg << std::endl;
#endif /* USE_MPI */
maxwell::solver_state_gpu state_g;
register_lua_usertypes_bystate(mpl, mod, state_g);
solver_mainloop(mod, state_g, mpl);
#else /* ENABLE_GPU_SOLVER */
std::cout << "GPU solver not compiled. Exiting." << std::endl;
#endif /* ENABLE_GPU_SOLVER */
}
else
{
#ifdef USE_MPI
if (comm_rank == 0) {
std::cout << "Running on " << yellowfg << "CPU " << nofg << "with ";
std::cout << yellowfg << comm_size << nofg << " MPI processes";
std::cout << std::endl;
}
#else /* USE_MPI */
std::cout << "Running on " << yellowfg << "CPU" << nofg << std::endl;
#endif /* USE_MPI */
maxwell::solver_state state_c;
register_lua_usertypes_bystate(mpl, mod, state_c);
solver_mainloop(mod, state_c, mpl);
}
#ifdef USE_MPI
MPI_Finalize();
#endif
getrusage(RUSAGE_SELF, &ru_end);
std::cout << "Max RSS: " << (ru_end.ru_maxrss - ru_start.ru_maxrss) << std::endl;