maxwell_gpu.cpp

/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin
 * solver for Conservation Laws.
 *
 * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège
 * 
 * This code is released under GNU AGPLv3 license, see LICENSE.txt for details.
 */

#include <iostream>

#include "libgmshdg/gmsh_io.h"
#include "libgmshdg/kernels_cpu.h"
#include "libgmshdg/kernels_gpu.h"
#include "libgmshdg/silo_output.hpp"
#include "maxwell/maxwell_interface.h"
#include "maxwell/maxwell_common.h"
#include "timecounter.h"


namespace maxwell {

void
init_matparams(const model& mod, solver_state_gpu& state,
    const parameter_loader& mpl)
{
    state.matparams.num_dofs = mod.num_dofs();
    state.matparams.num_fluxes = mod.num_fluxes();
    vecxd inv_eps = vecxd::Zero( mod.num_dofs() );
    vecxd inv_mu = vecxd::Zero( mod.num_dofs() );
    vecxd sigma = vecxd::Zero( mod.num_dofs() );
    vecxd sigma_over_eps = vecxd::Zero( mod.num_dofs() );

    for (auto& e : mod)
    {
        auto etag = e.material_tag();
        for (size_t iT = 0; iT < e.num_cells(); iT++)
        {
            auto& pe = e.cell(iT);
            auto& re = e.cell_refelem(pe);
            auto bar = pe.barycenter();
            auto v_epsilon = mpl.epsilon(etag, bar);
            auto v_mu = mpl.mu(etag, bar);
            auto v_sigma = mpl.sigma(etag, bar);
            auto ofs = e.cell_model_dof_offset(iT);

            for (size_t iD = 0; iD < re.num_basis_functions(); iD++)
            {
                inv_eps(ofs+iD) = 1./v_epsilon;
                inv_mu(ofs+iD) = 1./v_mu;
                sigma(ofs+iD) = v_sigma;
                sigma_over_eps(ofs+iD) = v_sigma/v_epsilon;
            }
        }
    }

    state.matparams.inv_epsilon.copyin( inv_eps.data(), inv_eps.size() );
    state.matparams.inv_mu.copyin( inv_mu.data(), inv_mu.size() );
    state.matparams.sigma.copyin( sigma.data(), sigma.size() );
    state.matparams.sigma_over_epsilon.copyin( sigma_over_eps.data(), sigma_over_eps.size() );

    vecxd aE, bE, aH, bH;
    init_lax_milgram(mod, mpl, aE, bE, aH, bH);
    state.matparams.aE.copyin(aE.data(), aE.size());
    state.matparams.bE.copyin(bE.data(), bE.size());
    state.matparams.aH.copyin(aH.data(), aH.size());
    state.matparams.bH.copyin(bH.data(), bH.size());

    vecxd bcc;
    init_boundary_conditions(mod, mpl, bcc);
    state.matparams.bc_coeffs.copyin(bcc.data(), bcc.size());
}

static void
build_source_compression_tables(const model& mod, solver_state_gpu& state, const parameter_loader& mpl)
{
    auto& bds = mod.boundary_descriptors();
    size_t face_num_base = 0;
    for (auto& e : mod)
    {
        for (size_t iF = 0; iF < e.num_faces(); iF++)
        {
            auto& pf = e.face(iF);
            auto& rf = e.face_refelem(pf);
            auto num_fluxes = rf.num_basis_functions();

            auto gfnum = face_num_base + iF;
            auto bd = bds[gfnum];
            if (bd.type == face_type::INTERFACE)
            {
                switch ( mpl.interface_type(bd.material_tag()) )
                {
                    case interface_condition::E_FIELD:
                    case interface_condition::SURFACE_CURRENT:
                        for (size_t i = 0; i < num_fluxes; i++)
                            state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
                        break;
                    
                    default:
                        break;
                }
            }

            if (bd.type == face_type::BOUNDARY)
            {
                switch ( mpl.boundary_type(bd.material_tag()) )
                {
                    case boundary_condition::PLANE_WAVE_E:
                        for (size_t i = 0; i < num_fluxes; i++)
                            state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
                        break;
                    
                    default:
                        break;
                }
            }
        }
        face_num_base += e.num_faces();
    }

    state.bndsrcs_decomp_table.copyin( state.bndsrcs_decomp_table_cpu.data(),
        state.bndsrcs_decomp_table_cpu.size() );

    state.bndsrcs_cpu.resize( state.bndsrcs_decomp_table_cpu.size() );
    state.bndsrcs_buf.resize( state.bndsrcs_decomp_table_cpu.size() );
}

void init_from_model(const model& mod, solver_state_gpu& state, const maxwell::parameter_loader& mpl)
{
    state.emf_curr.resize( mod.num_dofs() );
    state.emf_next.resize( mod.num_dofs() );
    state.dx.resize( mod.num_dofs() );
    state.dy.resize( mod.num_dofs() );
    state.dz.resize( mod.num_dofs() );

    state.jumps.resize( mod.num_fluxes() );
    state.fluxes.resize( mod.num_fluxes() );

    if ( mpl.sim_timeIntegrator() == time_integrator_type::RK4 )
    {
        state.k1.resize( mod.num_dofs() );
        state.k2.resize( mod.num_dofs() );
        state.k3.resize( mod.num_dofs() );
        state.k4.resize( mod.num_dofs() );
    }
    
    state.tmp.resize( mod.num_dofs() );

    state.Jx_src.resize( mod.num_dofs() );
    state.Jy_src.resize( mod.num_dofs() );
    state.Jz_src.resize( mod.num_dofs() );
    state.Jx_src_gpu.resize( mod.num_dofs() );
    state.Jy_src_gpu.resize( mod.num_dofs() );
    state.Jz_src_gpu.resize( mod.num_dofs() );
    state.Jx_src_gpu_buf.resize( mod.num_dofs() );
    state.Jy_src_gpu_buf.resize( mod.num_dofs() );
    state.Jz_src_gpu_buf.resize( mod.num_dofs() );

    state.bndsrcs.resize( mod.num_fluxes() );
    state.bndsrcs_decomp_cpu.resize( mod.num_fluxes() );

    for (auto& e : mod)
    {
        entity_data_cpu ed;
        e.populate_entity_data(ed, mod);
        entity_data_gpu edg(ed);
        state.eds.push_back( std::move(ed) );
        state.edgs.push_back( std::move(edg) );
    }

    build_source_compression_tables(mod, state, mpl);

    state.curr_time = 0.0;
    state.curr_timestep = 0;
}

void
compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field& csrcs)
{
    for (size_t i = 0; i < csrcs.num_dofs; i++)
    {
        auto from_ofs = state.bndsrcs_decomp_table_cpu[i];
        csrcs.Ex[i] = srcs.Ex[from_ofs]; 
        csrcs.Ey[i] = srcs.Ey[from_ofs]; 
        csrcs.Ez[i] = srcs.Ez[from_ofs]; 
        csrcs.Hx[i] = srcs.Hx[from_ofs]; 
        csrcs.Hy[i] = srcs.Hy[from_ofs]; 
        csrcs.Hz[i] = srcs.Hz[from_ofs]; 
    }
}

static void
compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
{
    auto currp = curr.data();
    auto nextp = next.data();
    auto dxp = state.dx.data();
    auto dyp = state.dy.data();
    auto dzp = state.dz.data();

    for (const auto& edg : state.edgs)
    {
        gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
    }

    auto Jx = state.Jx_src_gpu.data();
    auto Jy = state.Jy_src_gpu.data();
    auto Jz = state.Jz_src_gpu.data();
    
    if (use_sources)
    {
        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
    }
    else
    {
        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
    }

    gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
    gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
    gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
}

static void
compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
{
    auto currp = curr.data();
    auto nextp = next.data();
    auto dxp = state.dx.data();
    auto dyp = state.dy.data();
    auto dzp = state.dz.data();

    for (const auto& edg : state.edgs)
    {
        gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0, state.compute_stream);
    }

    
    gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
    gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
    gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
}

static void
compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
{
    auto currp = curr.data();
    auto nextp = next.data();
    auto dxp = state.dx.data();
    auto dyp = state.dy.data();
    auto dzp = state.dz.data();

    for (const auto& edg : state.edgs)
    {
        gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0, state.compute_stream);
        gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
    }

    if (use_sources)
    {
        auto Jx = state.Jx_src_gpu.data();
        auto Jy = state.Jy_src_gpu.data();
        auto Jz = state.Jz_src_gpu.data();
        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
    }
    else
    {
        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
    }

}

static void
compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
{
    auto matparams_ptrs = state.matparams.data();
    double *bc_coeffs = matparams_ptrs.bc_coeffs;

    for (const auto& edg : state.edgs)
        maxwell::gpu_compute_jumps(edg, in, state.jumps, bc_coeffs, state.compute_stream);

    for (const auto& edg : state.edgs)
        maxwell::gpu_compute_fluxes(edg, state.jumps, state.fluxes, state.bndsrcs,
            state.matparams, use_sources, state.compute_stream);

    auto fluxesp = state.fluxes.data();
    auto outp = out.data();

    for (auto& edg : state.edgs)
    {
        gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
    }
}

static void
compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
{
    auto matparams_ptrs = state.matparams.data();
    double *bc_coeffs = matparams_ptrs.bc_coeffs;

    for (const auto& edg : state.edgs)
        maxwell::gpu_compute_jumps_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);

    for (const auto& edg : state.edgs)
        maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
            state.matparams, use_sources, state.compute_stream);

    auto fluxesp = state.fluxes.data();
    auto outp = out.data();

    for (auto& edg : state.edgs)
    {
        gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
    }
}

static void
compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
{
    auto matparams_ptrs = state.matparams.data();
    double *bc_coeffs = matparams_ptrs.bc_coeffs;

    for (const auto& edg : state.edgs)
        maxwell::gpu_compute_jumps_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);

    for (const auto& edg : state.edgs)
        maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
            state.matparams, use_sources, state.compute_stream);

    auto fluxesp = state.fluxes.data();
    auto outp = out.data();

    for (auto& edg : state.edgs)
    {
        gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
        gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
    }
}

static void
leapfrog(solver_state_gpu& state, const parameter_loader& mpl)
{
    double *r = state.matparams.sigma_over_epsilon.data();
    double *eps = state.matparams.inv_epsilon.data();
    double *mu = state.matparams.inv_mu.data();

    auto currp = state.emf_curr.data();
    auto nextp = state.emf_next.data();
    auto tmpp = state.tmp.data();

    bool ve = mpl.volume_sources_enabled();
    compute_curls_H(state, state.emf_curr, state.tmp, ve);

    auto be = mpl.boundary_sources_enabled();
    auto ie = mpl.interface_sources_enabled();
    compute_fluxes_H(state, state.emf_curr, state.tmp, be or ie);
    
    auto dt = state.delta_t;
    gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
        dt, state.compute_stream);
    gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
        dt, state.compute_stream);
    gpu_compute_leapfrog_update(nextp.Ez, currp.Ez, r, tmpp.Ez, eps, nextp.num_dofs,
        dt, state.compute_stream);

    compute_curls_E(state, state.emf_next, state.tmp);
    compute_fluxes_E(state, state.emf_next, state.tmp, be or ie);

    gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
        dt, state.compute_stream);
    gpu_compute_leapfrog_update(nextp.Hy, currp.Hy, tmpp.Hy, mu, nextp.num_dofs,
        dt, state.compute_stream);
    gpu_compute_leapfrog_update(nextp.Hz, currp.Hz, tmpp.Hz, mu, nextp.num_dofs,
        dt, state.compute_stream);
}

static void
apply_operator(solver_state_gpu& state, const parameter_loader& mpl, const field_gpu& curr,
    field_gpu& next)
{
    bool ve = mpl.volume_sources_enabled();
    auto be = mpl.boundary_sources_enabled();
    auto ie = mpl.interface_sources_enabled();
    compute_curls(state, curr, next, ve);
    compute_fluxes(state, curr, next, be or ie);
}

static void
compute_euler_update(solver_state_gpu& state, const field_gpu& y,
    const field_gpu& k, double dt, field_gpu& out)
{
    double *r = state.matparams.sigma_over_epsilon.data();
    double *eps = state.matparams.inv_epsilon.data();
    double *mu = state.matparams.inv_mu.data();

    auto outp = out.data();
    auto yp = y.data();
    auto kp = k.data();

    gpu_compute_euler_update(outp.Ex, yp.Ex, kp.Ex, r, eps, out.num_dofs, dt, state.compute_stream);
    gpu_compute_euler_update(outp.Ey, yp.Ey, kp.Ey, r, eps, out.num_dofs, dt, state.compute_stream);
    gpu_compute_euler_update(outp.Ez, yp.Ez, kp.Ez, r, eps, out.num_dofs, dt, state.compute_stream);
    gpu_compute_euler_update(outp.Hx, yp.Hx, kp.Hx, mu, out.num_dofs, dt, state.compute_stream);
    gpu_compute_euler_update(outp.Hy, yp.Hy, kp.Hy, mu, out.num_dofs, dt, state.compute_stream);
    gpu_compute_euler_update(outp.Hz, yp.Hz, kp.Hz, mu, out.num_dofs, dt, state.compute_stream);
}

static void
compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
    double dt, field_gpu& out)
{
    double *r = state.matparams.sigma_over_epsilon.data();
    double *eps = state.matparams.inv_epsilon.data();
    double *mu = state.matparams.inv_mu.data();

    auto inp = in.data();
    auto outp = out.data();
    auto k1p = state.k1.data();
    auto k2p = state.k2.data();
    auto k3p = state.k3.data();
    auto k4p = state.k4.data();

    gpu_compute_rk4_weighted_sum(outp.Ex, inp.Ex, k1p.Ex, k2p.Ex, k3p.Ex, k4p.Ex, r, eps, out.num_dofs, dt, state.compute_stream);
    gpu_compute_rk4_weighted_sum(outp.Ey, inp.Ey, k1p.Ey, k2p.Ey, k3p.Ey, k4p.Ey, r, eps, out.num_dofs, dt, state.compute_stream);
    gpu_compute_rk4_weighted_sum(outp.Ez, inp.Ez, k1p.Ez, k2p.Ez, k3p.Ez, k4p.Ez, r, eps, out.num_dofs, dt, state.compute_stream);
    gpu_compute_rk4_weighted_sum(outp.Hx, inp.Hx, k1p.Hx, k2p.Hx, k3p.Hx, k4p.Hx, mu, out.num_dofs, dt, state.compute_stream);
    gpu_compute_rk4_weighted_sum(outp.Hy, inp.Hy, k1p.Hy, k2p.Hy, k3p.Hy, k4p.Hy, mu, out.num_dofs, dt, state.compute_stream);
    gpu_compute_rk4_weighted_sum(outp.Hz, inp.Hz, k1p.Hz, k2p.Hz, k3p.Hz, k4p.Hz, mu, out.num_dofs, dt, state.compute_stream);
}

void
timestep(const model& mod, solver_state_gpu& state, const parameter_loader& mpl, time_integrator_type ti)
{
    //timecounter tc;
    //tc.tic();

    if (ti == time_integrator_type::EULER)
    {
        apply_operator(state, mpl, state.emf_curr, state.tmp);
        compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
    }

    if (ti == time_integrator_type::RK4)
    {
        apply_operator(state, mpl, state.emf_curr, state.k1);

        compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
        apply_operator(state, mpl, state.tmp, state.k2);

        compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
        apply_operator(state, mpl, state.tmp, state.k3);
        
        compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
        apply_operator(state, mpl, state.tmp, state.k4);

        compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
    }

    if (ti == time_integrator_type::LEAPFROG)
    {
        leapfrog(state, mpl);
    }

    state.curr_time += state.delta_t;
    state.curr_timestep += 1;

    //double time = tc.toc();
    //double dofs_per_sec = 6*state.emf_curr.num_dofs/time;
    //std::cout << "Timestep " << state.curr_timestep << ", " << dofs_per_sec << " DOFs/s" << std::endl;
}

void
prepare_sources(const model& mod, maxwell::solver_state_gpu& state,
    maxwell::parameter_loader& mpl)
{
    if ( mpl.boundary_sources_enabled() )
        maxwell::eval_boundary_sources_new(mod, mpl, state, state.bndsrcs_decomp_cpu);

    if ( mpl.interface_sources_enabled() )
        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

    compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
    state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
    state.memcpy_stream.wait();
    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
}

static void
copyin_volume_sources(const entity& e, solver_state_gpu& state)
{
    auto ofs = e.dof_base();
    auto num = e.num_dofs();

    state.Jx_src_gpu_buf.copyin( state.Jx_src.data(), ofs, num, state.memcpy_stream );
    state.Jy_src_gpu_buf.copyin( state.Jy_src.data(), ofs, num, state.memcpy_stream );
    state.Jz_src_gpu_buf.copyin( state.Jz_src.data(), ofs, num, state.memcpy_stream );
}

static void
eval_volume_sources(const model& mod, const parameter_loader& mpl, solver_state_gpu& state)
{
    for (auto& e : mod)
    {
        auto etag = e.material_tag();
        if ( not mpl.volume_has_source(etag) )
            continue;

        auto fJ = [&](const point_3d& pt) -> vec3d {
            return mpl.eval_volume_source(etag, pt, state.curr_time);
        };

        e.project(fJ, state.Jx_src.data(), state.Jy_src.data(), state.Jz_src.data());
        copyin_volume_sources(e, state);
    }
}

void
do_sources(const model& mod, maxwell::solver_state_gpu& state,
    maxwell::parameter_loader& mpl)
{
    if ( mpl.source_has_changed_state() )
    {
        state.Jx_src.zero();
        state.Jy_src.zero();
        state.Jz_src.zero();
        state.Jx_src_gpu.zero();
        state.Jy_src_gpu.zero();
        state.Jz_src_gpu.zero();
        state.Jx_src_gpu_buf.zero();
        state.Jy_src_gpu_buf.zero();
        state.Jz_src_gpu_buf.zero();
        state.bndsrcs_decomp_cpu.zero();
        state.bndsrcs_cpu.zero();
        mpl.source_was_cleared();
    }

    auto be = mpl.boundary_sources_enabled();
    auto ie = mpl.interface_sources_enabled();
    auto ve = mpl.volume_sources_enabled();


    if ( ve )
        eval_volume_sources(mod, mpl, state);

    if ( be )
        eval_boundary_sources_new(mod, mpl, state, state.bndsrcs_decomp_cpu);

    if ( ie )
        eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

    if ( be or ie )
    {
        compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
        state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
    }
}

void
swap(solver_state_gpu& state, const parameter_loader& mpl)
{
    /* Wait for all async copies to finish */
    checkGPU( gpuDeviceSynchronize() );

    auto be = mpl.boundary_sources_enabled();
    auto ie = mpl.interface_sources_enabled();
    if ( be or ie )
        decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
    
    std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf);
    std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf);
    std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf);
    std::swap(state.emf_curr, state.emf_next);

    /* Wait for decompression to finish */
    checkGPU( gpuDeviceSynchronize() );
}

#ifdef USE_MPI
void
gather_field(const model& mod, const maxwell::field_gpu& in, maxwell::field_gpu& f,
    int root, MPI_Comm comm)
{}
#endif /* USE_MPI */

void
export_fields_to_silo(const model& mod, maxwell::solver_state_gpu& state,
    const maxwell::parameter_loader& mpl, std::string basename)
{
    /* This is supposed to be called after swap */
    if (basename == "")
        basename = "timestep";

    std::stringstream ss;
    ss << mpl.sim_name() << "/" << basename << "_" << state.curr_timestep << ".silo";

    silo sdb;
    sdb.create_db(ss.str());
    sdb.import_mesh_from_gmsh();
    sdb.write_mesh(state.curr_time, state.curr_timestep);

    maxwell::field f;
    f.resize( mod.num_dofs() );
    state.emf_curr.copyout(f);


    auto E_exportmode = mpl.postpro_fieldExportMode("E");
    export_vector_field(mod, sdb, f.Ex, f.Ey, f.Ez, "E", E_exportmode);

    auto H_exportmode = mpl.postpro_fieldExportMode("H");
    export_vector_field(mod, sdb, f.Hx, f.Hy, f.Hz, "H", H_exportmode);

    auto sigma = [&](int tag, const point_3d& pt) -> double { return mpl.sigma(tag, pt); };
    auto J_exportmode = mpl.postpro_fieldExportMode("J");
    export_vector_field(mod, sdb, f.Ex, f.Ey, f.Ez, sigma, "J", J_exportmode);
}

} // namespace maxwell