Skip to content
Snippets Groups Projects
Select Git revision
  • 9cba41a382d10f5a5b2bc8395b7e0fb40259d5a7
  • master default protected
  • comm_opt
  • clem_dev
  • clem_dev_corrected
  • kokkos
  • devel
  • bcast_debug
  • MC/high_order_geometry
  • MC/mpi_nonblocking
  • MC/multigpu
  • MC/lifting_oneshot
  • MC/physent
  • curls_marco
  • MC/gefpg_lua_binding
  • emdant/dg-cherry-pick-8f1f09f5
  • v0.3.0
  • v0.2.0
  • v0.1.0
19 results

maxwell_gpu.cpp

Blame
  • maxwell_gpu.cpp 22.78 KiB
    /* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin
     * solver for Conservation Laws.
     *
     * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège
     * 
     * This code is released under GNU AGPLv3 license, see LICENSE.txt for details.
     */
    
    #include <iostream>
    
    #include "libgmshdg/gmsh_io.h"
    #include "libgmshdg/kernels_cpu.h"
    #include "libgmshdg/kernels_gpu.h"
    #include "libgmshdg/silo_output.hpp"
    #include "maxwell/maxwell_interface.h"
    #include "maxwell/maxwell_common.h"
    #include "timecounter.h"
    
    
    namespace maxwell {
    
    void
    init_matparams(const model& mod, solver_state_gpu& state,
        const parameter_loader& mpl)
    {
        state.matparams.num_dofs = mod.num_dofs();
        state.matparams.num_fluxes = mod.num_fluxes();
        vecxd inv_eps = vecxd::Zero( mod.num_dofs() );
        vecxd inv_mu = vecxd::Zero( mod.num_dofs() );
        vecxd sigma = vecxd::Zero( mod.num_dofs() );
        vecxd sigma_over_eps = vecxd::Zero( mod.num_dofs() );
    
        for (auto& e : mod)
        {
            auto etag = e.material_tag();
            for (size_t iT = 0; iT < e.num_cells(); iT++)
            {
                auto& pe = e.cell(iT);
                auto& re = e.cell_refelem(pe);
                auto bar = pe.barycenter();
                auto v_epsilon = mpl.epsilon(etag, bar);
                auto v_mu = mpl.mu(etag, bar);
                auto v_sigma = mpl.sigma(etag, bar);
                auto ofs = e.cell_model_dof_offset(iT);
    
                for (size_t iD = 0; iD < re.num_basis_functions(); iD++)
                {
                    inv_eps(ofs+iD) = 1./v_epsilon;
                    inv_mu(ofs+iD) = 1./v_mu;
                    sigma(ofs+iD) = v_sigma;
                    sigma_over_eps(ofs+iD) = v_sigma/v_epsilon;
                }
            }
        }
    
        state.matparams.inv_epsilon.copyin( inv_eps.data(), inv_eps.size() );
        state.matparams.inv_mu.copyin( inv_mu.data(), inv_mu.size() );
        state.matparams.sigma.copyin( sigma.data(), sigma.size() );
        state.matparams.sigma_over_epsilon.copyin( sigma_over_eps.data(), sigma_over_eps.size() );
    
        vecxd aE, bE, aH, bH;
        init_lax_milgram(mod, mpl, aE, bE, aH, bH);
        state.matparams.aE.copyin(aE.data(), aE.size());
        state.matparams.bE.copyin(bE.data(), bE.size());
        state.matparams.aH.copyin(aH.data(), aH.size());
        state.matparams.bH.copyin(bH.data(), bH.size());
    
        vecxd bcc;
        init_boundary_conditions(mod, mpl, bcc);
        state.matparams.bc_coeffs.copyin(bcc.data(), bcc.size());
    }
    
    static void
    build_source_compression_tables(const model& mod, solver_state_gpu& state, const parameter_loader& mpl)
    {
        auto& bds = mod.boundary_descriptors();
        size_t face_num_base = 0;
        for (auto& e : mod)
        {
            for (size_t iF = 0; iF < e.num_faces(); iF++)
            {
                auto& pf = e.face(iF);
                auto& rf = e.face_refelem(pf);
                auto num_fluxes = rf.num_basis_functions();
    
                auto gfnum = face_num_base + iF;
                auto bd = bds[gfnum];
                if (bd.type == face_type::INTERFACE)
                {
                    switch ( mpl.interface_type(bd.material_tag()) )
                    {
                        case interface_condition::E_FIELD:
                        case interface_condition::SURFACE_CURRENT:
                            for (size_t i = 0; i < num_fluxes; i++)
                                state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
                            break;
                        
                        default:
                            break;
                    }
                }
    
                if (bd.type == face_type::BOUNDARY)
                {
                    switch ( mpl.boundary_type(bd.material_tag()) )
                    {
                        case boundary_condition::PLANE_WAVE_E:
                            for (size_t i = 0; i < num_fluxes; i++)
                                state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
                            break;
                        
                        default:
                            break;
                    }
                }
            }
            face_num_base += e.num_faces();
        }
    
        state.bndsrcs_decomp_table.copyin( state.bndsrcs_decomp_table_cpu.data(),
            state.bndsrcs_decomp_table_cpu.size() );
    
        state.bndsrcs_cpu.resize( state.bndsrcs_decomp_table_cpu.size() );
        state.bndsrcs_buf.resize( state.bndsrcs_decomp_table_cpu.size() );
    }
    
    void init_from_model(const model& mod, solver_state_gpu& state, const maxwell::parameter_loader& mpl)
    {
        state.emf_curr.resize( mod.num_dofs() );
        state.emf_next.resize( mod.num_dofs() );
        state.dx.resize( mod.num_dofs() );
        state.dy.resize( mod.num_dofs() );
        state.dz.resize( mod.num_dofs() );
    
        state.jumps.resize( mod.num_fluxes() );
        state.fluxes.resize( mod.num_fluxes() );
    
        if ( mpl.sim_timeIntegrator() == time_integrator_type::RK4 )
        {
            state.k1.resize( mod.num_dofs() );
            state.k2.resize( mod.num_dofs() );
            state.k3.resize( mod.num_dofs() );
            state.k4.resize( mod.num_dofs() );
        }
        
        state.tmp.resize( mod.num_dofs() );
    
        state.Jx_src.resize( mod.num_dofs() );
        state.Jy_src.resize( mod.num_dofs() );
        state.Jz_src.resize( mod.num_dofs() );
        state.Jx_src_gpu.resize( mod.num_dofs() );
        state.Jy_src_gpu.resize( mod.num_dofs() );
        state.Jz_src_gpu.resize( mod.num_dofs() );
        state.Jx_src_gpu_buf.resize( mod.num_dofs() );
        state.Jy_src_gpu_buf.resize( mod.num_dofs() );
        state.Jz_src_gpu_buf.resize( mod.num_dofs() );
    
        state.bndsrcs.resize( mod.num_fluxes() );
        state.bndsrcs_decomp_cpu.resize( mod.num_fluxes() );
    
        for (auto& e : mod)
        {
            entity_data_cpu ed;
            e.populate_entity_data(ed, mod);
            entity_data_gpu edg(ed);
            state.eds.push_back( std::move(ed) );
            state.edgs.push_back( std::move(edg) );
        }
    
        build_source_compression_tables(mod, state, mpl);
    
        state.curr_time = 0.0;
        state.curr_timestep = 0;
    }
    
    void
    compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field& csrcs)
    {
        for (size_t i = 0; i < csrcs.num_dofs; i++)
        {
            auto from_ofs = state.bndsrcs_decomp_table_cpu[i];
            csrcs.Ex[i] = srcs.Ex[from_ofs]; 
            csrcs.Ey[i] = srcs.Ey[from_ofs]; 
            csrcs.Ez[i] = srcs.Ez[from_ofs]; 
            csrcs.Hx[i] = srcs.Hx[from_ofs]; 
            csrcs.Hy[i] = srcs.Hy[from_ofs]; 
            csrcs.Hz[i] = srcs.Hz[from_ofs]; 
        }
    }
    
    static void
    compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
    {
        auto currp = curr.data();
        auto nextp = next.data();
        auto dxp = state.dx.data();
        auto dyp = state.dy.data();
        auto dzp = state.dz.data();
    
        for (const auto& edg : state.edgs)
        {
            gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
        }
    
        auto Jx = state.Jx_src_gpu.data();
        auto Jy = state.Jy_src_gpu.data();
        auto Jz = state.Jz_src_gpu.data();
        
        if (use_sources)
        {
            gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
        }
        else
        {
            gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
        }
    
        gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
    }
    
    static void
    compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
    {
        auto currp = curr.data();
        auto nextp = next.data();
        auto dxp = state.dx.data();
        auto dyp = state.dy.data();
        auto dzp = state.dz.data();
    
        for (const auto& edg : state.edgs)
        {
            gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0, state.compute_stream);
        }
    
        
        gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
        gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
    }
    
    static void
    compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
    {
        auto currp = curr.data();
        auto nextp = next.data();
        auto dxp = state.dx.data();
        auto dyp = state.dy.data();
        auto dzp = state.dz.data();
    
        for (const auto& edg : state.edgs)
        {
            gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0, state.compute_stream);
            gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
        }
    
        if (use_sources)
        {
            auto Jx = state.Jx_src_gpu.data();
            auto Jy = state.Jy_src_gpu.data();
            auto Jz = state.Jz_src_gpu.data();
            gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
        }
        else
        {
            gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
            gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
        }
    
    }
    
    static void
    compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
    {
        auto matparams_ptrs = state.matparams.data();
        double *bc_coeffs = matparams_ptrs.bc_coeffs;
    
        for (const auto& edg : state.edgs)
            maxwell::gpu_compute_jumps(edg, in, state.jumps, bc_coeffs, state.compute_stream);
    
        for (const auto& edg : state.edgs)
            maxwell::gpu_compute_fluxes(edg, state.jumps, state.fluxes, state.bndsrcs,
                state.matparams, use_sources, state.compute_stream);
    
        auto fluxesp = state.fluxes.data();
        auto outp = out.data();
    
        for (auto& edg : state.edgs)
        {
            gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
        }
    }
    
    static void
    compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
    {
        auto matparams_ptrs = state.matparams.data();
        double *bc_coeffs = matparams_ptrs.bc_coeffs;
    
        for (const auto& edg : state.edgs)
            maxwell::gpu_compute_jumps_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
    
        for (const auto& edg : state.edgs)
            maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
                state.matparams, use_sources, state.compute_stream);
    
        auto fluxesp = state.fluxes.data();
        auto outp = out.data();
    
        for (auto& edg : state.edgs)
        {
            gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
        }
    }
    
    static void
    compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
    {
        auto matparams_ptrs = state.matparams.data();
        double *bc_coeffs = matparams_ptrs.bc_coeffs;
    
        for (const auto& edg : state.edgs)
            maxwell::gpu_compute_jumps_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
    
        for (const auto& edg : state.edgs)
            maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
                state.matparams, use_sources, state.compute_stream);
    
        auto fluxesp = state.fluxes.data();
        auto outp = out.data();
    
        for (auto& edg : state.edgs)
        {
            gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
            gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
        }
    }
    
    static void
    leapfrog(solver_state_gpu& state, const parameter_loader& mpl)
    {
        double *r = state.matparams.sigma_over_epsilon.data();
        double *eps = state.matparams.inv_epsilon.data();
        double *mu = state.matparams.inv_mu.data();
    
        auto currp = state.emf_curr.data();
        auto nextp = state.emf_next.data();
        auto tmpp = state.tmp.data();
    
        bool ve = mpl.volume_sources_enabled();
        compute_curls_H(state, state.emf_curr, state.tmp, ve);
    
        auto be = mpl.boundary_sources_enabled();
        auto ie = mpl.interface_sources_enabled();
        compute_fluxes_H(state, state.emf_curr, state.tmp, be or ie);
        
        auto dt = state.delta_t;
        gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
            dt, state.compute_stream);
        gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
            dt, state.compute_stream);
        gpu_compute_leapfrog_update(nextp.Ez, currp.Ez, r, tmpp.Ez, eps, nextp.num_dofs,
            dt, state.compute_stream);
    
        compute_curls_E(state, state.emf_next, state.tmp);
        compute_fluxes_E(state, state.emf_next, state.tmp, be or ie);
    
        gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
            dt, state.compute_stream);
        gpu_compute_leapfrog_update(nextp.Hy, currp.Hy, tmpp.Hy, mu, nextp.num_dofs,
            dt, state.compute_stream);
        gpu_compute_leapfrog_update(nextp.Hz, currp.Hz, tmpp.Hz, mu, nextp.num_dofs,
            dt, state.compute_stream);
    }
    
    static void
    apply_operator(solver_state_gpu& state, const parameter_loader& mpl, const field_gpu& curr,
        field_gpu& next)
    {
        bool ve = mpl.volume_sources_enabled();
        auto be = mpl.boundary_sources_enabled();
        auto ie = mpl.interface_sources_enabled();
        compute_curls(state, curr, next, ve);
        compute_fluxes(state, curr, next, be or ie);
    }
    
    static void
    compute_euler_update(solver_state_gpu& state, const field_gpu& y,
        const field_gpu& k, double dt, field_gpu& out)
    {
        double *r = state.matparams.sigma_over_epsilon.data();
        double *eps = state.matparams.inv_epsilon.data();
        double *mu = state.matparams.inv_mu.data();
    
        auto outp = out.data();
        auto yp = y.data();
        auto kp = k.data();
    
        gpu_compute_euler_update(outp.Ex, yp.Ex, kp.Ex, r, eps, out.num_dofs, dt, state.compute_stream);
        gpu_compute_euler_update(outp.Ey, yp.Ey, kp.Ey, r, eps, out.num_dofs, dt, state.compute_stream);
        gpu_compute_euler_update(outp.Ez, yp.Ez, kp.Ez, r, eps, out.num_dofs, dt, state.compute_stream);
        gpu_compute_euler_update(outp.Hx, yp.Hx, kp.Hx, mu, out.num_dofs, dt, state.compute_stream);
        gpu_compute_euler_update(outp.Hy, yp.Hy, kp.Hy, mu, out.num_dofs, dt, state.compute_stream);
        gpu_compute_euler_update(outp.Hz, yp.Hz, kp.Hz, mu, out.num_dofs, dt, state.compute_stream);
    }
    
    static void
    compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
        double dt, field_gpu& out)
    {
        double *r = state.matparams.sigma_over_epsilon.data();
        double *eps = state.matparams.inv_epsilon.data();
        double *mu = state.matparams.inv_mu.data();
    
        auto inp = in.data();
        auto outp = out.data();
        auto k1p = state.k1.data();
        auto k2p = state.k2.data();
        auto k3p = state.k3.data();
        auto k4p = state.k4.data();
    
        gpu_compute_rk4_weighted_sum(outp.Ex, inp.Ex, k1p.Ex, k2p.Ex, k3p.Ex, k4p.Ex, r, eps, out.num_dofs, dt, state.compute_stream);
        gpu_compute_rk4_weighted_sum(outp.Ey, inp.Ey, k1p.Ey, k2p.Ey, k3p.Ey, k4p.Ey, r, eps, out.num_dofs, dt, state.compute_stream);
        gpu_compute_rk4_weighted_sum(outp.Ez, inp.Ez, k1p.Ez, k2p.Ez, k3p.Ez, k4p.Ez, r, eps, out.num_dofs, dt, state.compute_stream);
        gpu_compute_rk4_weighted_sum(outp.Hx, inp.Hx, k1p.Hx, k2p.Hx, k3p.Hx, k4p.Hx, mu, out.num_dofs, dt, state.compute_stream);
        gpu_compute_rk4_weighted_sum(outp.Hy, inp.Hy, k1p.Hy, k2p.Hy, k3p.Hy, k4p.Hy, mu, out.num_dofs, dt, state.compute_stream);
        gpu_compute_rk4_weighted_sum(outp.Hz, inp.Hz, k1p.Hz, k2p.Hz, k3p.Hz, k4p.Hz, mu, out.num_dofs, dt, state.compute_stream);
    }
    
    void
    timestep(const model& mod, solver_state_gpu& state, const parameter_loader& mpl, time_integrator_type ti)
    {
        //timecounter tc;
        //tc.tic();
    
        if (ti == time_integrator_type::EULER)
        {
            apply_operator(state, mpl, state.emf_curr, state.tmp);
            compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
        }
    
        if (ti == time_integrator_type::RK4)
        {
            apply_operator(state, mpl, state.emf_curr, state.k1);
    
            compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
            apply_operator(state, mpl, state.tmp, state.k2);
    
            compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
            apply_operator(state, mpl, state.tmp, state.k3);
            
            compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
            apply_operator(state, mpl, state.tmp, state.k4);
    
            compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
        }
    
        if (ti == time_integrator_type::LEAPFROG)
        {
            leapfrog(state, mpl);
        }
    
        state.curr_time += state.delta_t;
        state.curr_timestep += 1;
    
        //double time = tc.toc();
        //double dofs_per_sec = 6*state.emf_curr.num_dofs/time;
        //std::cout << "Timestep " << state.curr_timestep << ", " << dofs_per_sec << " DOFs/s" << std::endl;
    }
    
    void
    prepare_sources(const model& mod, maxwell::solver_state_gpu& state,
        maxwell::parameter_loader& mpl)
    {
        if ( mpl.boundary_sources_enabled() )
            maxwell::eval_boundary_sources_new(mod, mpl, state, state.bndsrcs_decomp_cpu);
    
        if ( mpl.interface_sources_enabled() )
            maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);
    
        compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
        state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
        state.memcpy_stream.wait();
        decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
    }
    
    static void
    copyin_volume_sources(const entity& e, solver_state_gpu& state)
    {
        auto ofs = e.dof_base();
        auto num = e.num_dofs();
    
        state.Jx_src_gpu_buf.copyin( state.Jx_src.data(), ofs, num, state.memcpy_stream );
        state.Jy_src_gpu_buf.copyin( state.Jy_src.data(), ofs, num, state.memcpy_stream );
        state.Jz_src_gpu_buf.copyin( state.Jz_src.data(), ofs, num, state.memcpy_stream );
    }
    
    static void
    eval_volume_sources(const model& mod, const parameter_loader& mpl, solver_state_gpu& state)
    {
        for (auto& e : mod)
        {
            auto etag = e.material_tag();
            if ( not mpl.volume_has_source(etag) )
                continue;
    
            auto fJ = [&](const point_3d& pt) -> vec3d {
                return mpl.eval_volume_source(etag, pt, state.curr_time);
            };
    
            e.project(fJ, state.Jx_src.data(), state.Jy_src.data(), state.Jz_src.data());
            copyin_volume_sources(e, state);
        }
    }
    
    void
    do_sources(const model& mod, maxwell::solver_state_gpu& state,
        maxwell::parameter_loader& mpl)
    {
        if ( mpl.source_has_changed_state() )
        {
            state.Jx_src.zero();
            state.Jy_src.zero();
            state.Jz_src.zero();
            state.Jx_src_gpu.zero();
            state.Jy_src_gpu.zero();
            state.Jz_src_gpu.zero();
            state.Jx_src_gpu_buf.zero();
            state.Jy_src_gpu_buf.zero();
            state.Jz_src_gpu_buf.zero();
            state.bndsrcs_decomp_cpu.zero();
            state.bndsrcs_cpu.zero();
            mpl.source_was_cleared();
        }
    
        auto be = mpl.boundary_sources_enabled();
        auto ie = mpl.interface_sources_enabled();
        auto ve = mpl.volume_sources_enabled();
    
    
        if ( ve )
            eval_volume_sources(mod, mpl, state);
    
        if ( be )
            eval_boundary_sources_new(mod, mpl, state, state.bndsrcs_decomp_cpu);
    
        if ( ie )
            eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);
    
        if ( be or ie )
        {
            compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
            state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
        }
    }
    
    void
    swap(solver_state_gpu& state, const parameter_loader& mpl)
    {
        /* Wait for all async copies to finish */
        checkGPU( gpuDeviceSynchronize() );
    
        auto be = mpl.boundary_sources_enabled();
        auto ie = mpl.interface_sources_enabled();
        if ( be or ie )
            decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
        
        std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf);
        std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf);
        std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf);
        std::swap(state.emf_curr, state.emf_next);
    
        /* Wait for decompression to finish */
        checkGPU( gpuDeviceSynchronize() );
    }
    
    #ifdef USE_MPI
    void
    gather_field(const model& mod, const maxwell::field_gpu& in, maxwell::field_gpu& f,
        int root, MPI_Comm comm)
    {}
    #endif /* USE_MPI */
    
    void
    export_fields_to_silo(const model& mod, maxwell::solver_state_gpu& state,
        const maxwell::parameter_loader& mpl, std::string basename)
    {
        /* This is supposed to be called after swap */
        if (basename == "")
            basename = "timestep";
    
        std::stringstream ss;
        ss << mpl.sim_name() << "/" << basename << "_" << state.curr_timestep << ".silo";
    
        silo sdb;
        sdb.create_db(ss.str());
        sdb.import_mesh_from_gmsh();
        sdb.write_mesh(state.curr_time, state.curr_timestep);
    
        maxwell::field f;
        f.resize( mod.num_dofs() );
        state.emf_curr.copyout(f);
    
    
        auto E_exportmode = mpl.postpro_fieldExportMode("E");
        export_vector_field(mod, sdb, f.Ex, f.Ey, f.Ez, "E", E_exportmode);
    
        auto H_exportmode = mpl.postpro_fieldExportMode("H");
        export_vector_field(mod, sdb, f.Hx, f.Hy, f.Hz, "H", H_exportmode);
    
        auto sigma = [&](int tag, const point_3d& pt) -> double { return mpl.sigma(tag, pt); };
        auto J_exportmode = mpl.postpro_fieldExportMode("J");
        export_vector_field(mod, sdb, f.Ex, f.Ey, f.Ez, sigma, "J", J_exportmode);
    }
    
    } // namespace maxwell