diff --git a/include/gpu_support.hpp b/include/gpu_support.hpp
index 4b9ca038166beafaf6a836d7ad2ed288812dd40e..e802a75f79fb981407e23ce98a8d4c1cabd13587 100644
--- a/include/gpu_support.hpp
+++ b/include/gpu_support.hpp
@@ -225,6 +225,7 @@ public:
 
         (void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
         m_size = other.m_size;
+        return *this;
     }
 
     void zero(void)
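
Note on the `gpu_support.hpp` hunk: the copy assignment operator is declared to return a reference, so flowing off its end without a `return` is undefined behavior even though the copy itself appears to work; the added `return *this;` closes that hole. A minimal stand-alone illustration of the required shape (the type below is hypothetical, not the project's device buffer):

```cpp
struct counter
{
    int value = 0;

    // An operator= declared to return counter& must end with `return *this;`;
    // omitting the return statement is undefined behavior.
    counter& operator=(const counter& other)
    {
        value = other.value;
        return *this;
    }
};
```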
diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index 1c81c6d51719278d8f16cea65646973cb69b006f..b61d926e3fbf1bb3c220139f51a79f10c1c27d19 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -148,4 +148,22 @@ gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F,
 
 void gpu_compute_flux_lifting(entity_data_gpu&, const double *, double *);
 
-void gpu_subtract(double *, const double *, const double *, size_t);
\ No newline at end of file
+void gpu_subtract(double *, const double *, const double *, size_t);
+
+void
+gpu_compute_euler_update(double *out, const double *y, const double *k,
+    const double *reaction, const double *material, size_t num_dofs, double dt);
+
+void
+gpu_compute_euler_update(double *out, const double *y, const double *k,
+    const double *material, size_t num_dofs, double dt);
+
+void
+gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
+    const double *k2, const double *k3, const double *k4,
+    const double *material, size_t num_dofs, double dt);
+
+void
+gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
+    const double *k2, const double *k3, const double *k4, const double *reaction,
+    const double *material, size_t num_dofs, double dt);
\ No newline at end of file
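
For reference, the overloads declared above implement the following per-DOF updates (read directly from the kernel bodies added in `src/kernels_cuda.cu` below, with `m` the material weight, `r` the reaction coefficient and `dt` the time step):

```latex
% Euler updates, without and with the reaction term
\mathrm{out}_i = y_i + \Delta t\, m_i k_i
\qquad
\mathrm{out}_i = y_i\,(1 - \Delta t\, r_i) + \Delta t\, m_i k_i

% RK4 weighted sums, without and with the reaction term
\mathrm{out}_i = \mathrm{in}_i + m_i \tfrac{\Delta t}{6}\,(k_{1,i} + 2k_{2,i} + 2k_{3,i} + k_{4,i})
\qquad
\mathrm{out}_i = \mathrm{in}_i\,(1 - \Delta t\, r_i) + m_i \tfrac{\Delta t}{6}\,(k_{1,i} + 2k_{2,i} + 2k_{3,i} + k_{4,i})
```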
diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
index f946095317b53bbf6f8e54f140222b4848e547c2..c547175dba201f35209de567353967239fddeda9 100644
--- a/include/maxwell_interface.h
+++ b/include/maxwell_interface.h
@@ -163,6 +163,12 @@ struct solver_state_gpu
     field_gpu                           dx;
     field_gpu                           dy;
     field_gpu                           dz;
+
+    field_gpu                           k1;
+    field_gpu                           k2;
+    field_gpu                           k3;
+    field_gpu                           k4;
+    field_gpu                           tmp;
 };
 
 void init_from_model(const model&, solver_state&);
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index da11465929dad3538f57be27107f8072254f8461..4591446a57f5426b44c01ccd5d4fd7547959b3f6 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -306,4 +306,125 @@ gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs)
         gs += 1;
     
     gpu_subtract_kernel<<<gs, SUBTRACT_THREADS>>>(dst, add, sub, num_dofs);
+}
+
+
+void __global__
+gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
+    const double * __restrict__ k, size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        out[dof] = y[dof] + dt*k[dof];
+}
+
+void __global__
+gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
+    const double * __restrict__ k, const double * __restrict__ material,
+    size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        out[dof] = y[dof] + dt*k[dof]*material[dof];
+}
+
+void
+gpu_compute_euler_update(double *out, const double *y, const double *k,
+    const double *material, size_t num_dofs, double dt)
+{
+    static const size_t EULER_THREADS = 256;
+
+    auto gs = num_dofs/EULER_THREADS;
+    if (num_dofs % EULER_THREADS != 0)
+        gs += 1;
+
+    gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, material, num_dofs, dt);
+}
+
+void __global__
+gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
+    const double * __restrict__ k, const double * __restrict__ reaction,
+    const double * __restrict__ material, size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        out[dof] = y[dof]*(1-dt*reaction[dof]) + dt*k[dof]*material[dof];
+}
+
+void
+gpu_compute_euler_update(double *out, const double *y, const double *k,
+    const double *reaction, const double *material, size_t num_dofs, double dt)
+{
+    static const size_t EULER_THREADS = 256;
+
+    auto gs = num_dofs/EULER_THREADS;
+    if (num_dofs % EULER_THREADS != 0)
+        gs += 1;
+
+    gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, reaction, material, num_dofs, dt);
+}
+
+void __global__
+gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
+    const double * __restrict__ k1, const double * __restrict__ k2,
+    const double * __restrict__ k3, const double * __restrict__ k4,
+    size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        out[dof] = in[dof] + dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
+}
+
+void __global__
+gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
+    const double * __restrict__ k1, const double * __restrict__ k2,
+    const double * __restrict__ k3, const double * __restrict__ k4,
+    const double * __restrict__ material,
+    size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        out[dof] = in[dof] + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
+}
+
+void
+gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
+    const double *k2, const double *k3, const double *k4,
+    const double *material, size_t num_dofs, double dt)
+{
+    static const size_t RK4_THREADS = 256;
+
+    auto gs = num_dofs/RK4_THREADS;
+    if (num_dofs % RK4_THREADS != 0)
+        gs += 1;
+
+    gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4,
+        material, num_dofs, dt);
+}
+
+void __global__
+gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
+    const double * __restrict__ k1, const double * __restrict__ k2,
+    const double * __restrict__ k3, const double * __restrict__ k4,
+    const double * __restrict__ reaction, const double * __restrict__ material,
+    size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        out[dof] = in[dof]*(1-dt*reaction[dof]) + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
+}
+
+void
+gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
+    const double *k2, const double *k3, const double *k4, const double *reaction,
+    const double *material, size_t num_dofs, double dt)
+{
+    static const size_t RK4_THREADS = 256;
+
+    auto gs = num_dofs/RK4_THREADS;
+    if (num_dofs % RK4_THREADS != 0)
+        gs += 1;
+
+    gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4,
+        reaction, material, num_dofs, dt);
 }
\ No newline at end of file
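
A host-side mirror of the material-weighted RK4 combination can be handy when checking `gpu_compute_rk4_weighted_sum` against small arrays. This is only a validation sketch; the function name and any harness around it are not part of the patch:

```cpp
#include <cstddef>

// CPU reference for the material-weighted RK4 combination:
// out = in + material * dt/6 * (k1 + 2*k2 + 2*k3 + k4), per degree of freedom.
static void
cpu_rk4_weighted_sum_ref(double *out, const double *in, const double *k1,
    const double *k2, const double *k3, const double *k4,
    const double *material, size_t num_dofs, double dt)
{
    for (size_t dof = 0; dof < num_dofs; dof++)
        out[dof] = in[dof] + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
}
```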
diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp
index b53f57e44bbfc213a3b6b4d312f2fbec97fb566e..9b20e65717e9da4dbc4bf628beb815e545fc0c58 100644
--- a/src/maxwell_gpu.cpp
+++ b/src/maxwell_gpu.cpp
@@ -16,18 +16,33 @@ void init_from_model(const model& mod, solver_state_gpu& state)
     state.dx.resize( mod.num_dofs() );
     state.dy.resize( mod.num_dofs() );
     state.dz.resize( mod.num_dofs() );
+
     state.jumps.resize( mod.num_fluxes() );
     state.fluxes.resize( mod.num_fluxes() );
 
+    std::vector<double> zeros( mod.num_dofs(), 0.0 );
     std::vector<double> onehalfs( mod.num_fluxes(), 0.5 );
+    std::vector<double> ones( mod.num_dofs(), 1.0 );
     std::vector<double> twos( mod.num_fluxes(), 2.0 );
 
+    state.matparams.inv_epsilon.copyin( ones.data(), ones.size() );
+    state.matparams.inv_mu.copyin( ones.data(), ones.size() );
+    state.matparams.sigma.copyin( zeros.data(), zeros.size() );
+    state.matparams.sigma_over_epsilon.copyin( zeros.data(), zeros.size() );
+
     state.matparams.aE.copyin( onehalfs.data(), onehalfs.size() );
     state.matparams.bE.copyin( onehalfs.data(), onehalfs.size() );
     state.matparams.aH.copyin( onehalfs.data(), onehalfs.size() );
     state.matparams.bH.copyin( onehalfs.data(), onehalfs.size() );
     state.matparams.bc_coeffs.copyin( twos.data(), twos.size() );
 
+    state.k1.resize( mod.num_dofs() );
+    state.k2.resize( mod.num_dofs() );
+    state.k3.resize( mod.num_dofs() );
+    state.k4.resize( mod.num_dofs() );
+    state.tmp.resize( mod.num_dofs() );
+
+
     for (auto& e : mod)
     {
         entity_data_cpu ed;
@@ -36,6 +51,9 @@ void init_from_model(const model& mod, solver_state_gpu& state)
         state.eds.push_back( std::move(ed) );
         state.edgs.push_back( std::move(edg) );
     }
+
+    state.curr_time = 0.0;
+    state.curr_timestep = 0;
 }
 
 
@@ -95,19 +113,84 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 void
 apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 {
-    timecounter_gpu tc;
-    tc.tic();
-    
     compute_curls(state, curr, next);
     compute_fluxes(state, curr, next);
+}
 
-    double time = tc.toc();
-    std::cout << curr.num_dofs/time << " DOF/s" << std::endl;
+void
+compute_euler_update(solver_state_gpu& state, const field_gpu& y,
+    const field_gpu& k, double dt, field_gpu& out)
+{
+    double *r = state.matparams.sigma_over_epsilon.data();
+    double *eps = state.matparams.inv_epsilon.data();
+    double *mu = state.matparams.inv_mu.data();
+
+    auto outp = out.data();
+    auto yp = y.data();
+    auto kp = k.data();
+
+    gpu_compute_euler_update(outp.Ex, yp.Ex, kp.Ex, r, eps, out.num_dofs, dt);
+    gpu_compute_euler_update(outp.Ey, yp.Ey, kp.Ey, r, eps, out.num_dofs, dt);
+    gpu_compute_euler_update(outp.Ez, yp.Ez, kp.Ez, r, eps, out.num_dofs, dt);
+    gpu_compute_euler_update(outp.Hx, yp.Hx, kp.Hx, mu, out.num_dofs, dt);
+    gpu_compute_euler_update(outp.Hy, yp.Hy, kp.Hy, mu, out.num_dofs, dt);
+    gpu_compute_euler_update(outp.Hz, yp.Hz, kp.Hz, mu, out.num_dofs, dt);
+}
+
+void
+compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
+    double dt, field_gpu& out)
+{
+    double *r = state.matparams.sigma_over_epsilon.data();
+    double *eps = state.matparams.inv_epsilon.data();
+    double *mu = state.matparams.inv_mu.data();
+
+    auto inp = in.data();
+    auto outp = out.data();
+    auto k1p = state.k1.data();
+    auto k2p = state.k2.data();
+    auto k3p = state.k3.data();
+    auto k4p = state.k4.data();
+
+    gpu_compute_rk4_weighted_sum(outp.Ex, inp.Ex, k1p.Ex, k2p.Ex, k3p.Ex, k4p.Ex, r, eps, out.num_dofs, dt);
+    gpu_compute_rk4_weighted_sum(outp.Ey, inp.Ey, k1p.Ey, k2p.Ey, k3p.Ey, k4p.Ey, r, eps, out.num_dofs, dt);
+    gpu_compute_rk4_weighted_sum(outp.Ez, inp.Ez, k1p.Ez, k2p.Ez, k3p.Ez, k4p.Ez, r, eps, out.num_dofs, dt);
+    gpu_compute_rk4_weighted_sum(outp.Hx, inp.Hx, k1p.Hx, k2p.Hx, k3p.Hx, k4p.Hx, mu, out.num_dofs, dt);
+    gpu_compute_rk4_weighted_sum(outp.Hy, inp.Hy, k1p.Hy, k2p.Hy, k3p.Hy, k4p.Hy, mu, out.num_dofs, dt);
+    gpu_compute_rk4_weighted_sum(outp.Hz, inp.Hz, k1p.Hz, k2p.Hz, k3p.Hz, k4p.Hz, mu, out.num_dofs, dt);
 }
 
 void
 timestep(solver_state_gpu& state)
-{}
+{
+    timecounter_gpu tc;
+    tc.tic();
+
+    /* Single-stage forward Euler update, kept for reference:
+    apply_operator(state, state.emf_curr, state.tmp);
+    compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
+    */
+
+    apply_operator(state, state.emf_curr, state.k1);
+
+    compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
+    apply_operator(state, state.tmp, state.k2);
+
+    compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
+    apply_operator(state, state.tmp, state.k3);
+
+    compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
+    apply_operator(state, state.tmp, state.k4);
+
+    compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
+
+    state.curr_time += state.delta_t;
+    state.curr_timestep += 1;
+
+    double time = tc.toc();
+    double dofs_per_sec = 6*state.emf_curr.num_dofs/time;    /* six field components per DOF */
+    std::cout << "Timestep " << state.curr_timestep << ", " << dofs_per_sec << " DOFs/s" << std::endl;
+}
 
 
 void
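
Up to the per-DOF material and reaction weights folded into `compute_euler_update` and `compute_rk4_weighted_sum`, the stages assembled in `timestep()` are the classical fourth-order Runge-Kutta scheme, with `f` the curl/flux operator applied by `apply_operator`:

```latex
k_1 = f(y_n), \quad
k_2 = f\!\left(y_n + \tfrac{\Delta t}{2}\,k_1\right), \quad
k_3 = f\!\left(y_n + \tfrac{\Delta t}{2}\,k_2\right), \quad
k_4 = f\!\left(y_n + \Delta t\,k_3\right),
\qquad
y_{n+1} = y_n + \tfrac{\Delta t}{6}\,(k_1 + 2k_2 + 2k_3 + k_4)
```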
diff --git a/src/maxwell_solver.cpp b/src/maxwell_solver.cpp
index c4d300119a555b118af48aa71d655eb693ddb58c..e8c6208ba97e85d91e3a193f9e6dbf46aa3e49cf 100644
--- a/src/maxwell_solver.cpp
+++ b/src/maxwell_solver.cpp
@@ -41,11 +41,10 @@ void test_it(const model& mod, State& state)
 
     maxwell::init_from_model(mod, state);
     maxwell::init_E_field(mod, state, E);
-    //maxwell::apply_operator(state, state.emf_curr, state.emf_next);
 
     state.delta_t = 0.001;
 
-    for(size_t i = 0; i < 1000; i++)
+    for(size_t i = 0; i < 10000; i++)
     {
         timestep(state);
         std::stringstream ss;