diff --git a/entity.cpp b/entity.cpp
index e8cad719a1ae7b246350f72e90a8c764957a6178..a06c7430102f1695991f2ccaea72e37df79e84b2 100644
--- a/entity.cpp
+++ b/entity.cpp
@@ -258,6 +258,7 @@ entity::populate_entity_data(entity_data_cpu<1>& ed) const
 {
     assert(g_order == 1);
     assert(reference_cells.size() > 0);
+    ed.a_order = a_order;
     ed.num_elems = num_elements_per_orientation();
     ed.num_orientations = num_orientations();
     ed.num_bf = reference_cells[0].num_basis_functions();
diff --git a/entity_data.h b/entity_data.h
index 130ce394efb3290982793e9f9191e3883f725aa0..0352a1a561fd642422c589a75f3469af0a302142 100644
--- a/entity_data.h
+++ b/entity_data.h
@@ -8,6 +8,7 @@ struct entity_data_cpu;
 template<>
 struct entity_data_cpu<1>
 {
+    int                 a_order;
     std::vector<size_t> num_elems;          /* No. of elements in the entity */
     size_t              num_orientations;   /* No. of bf. orientations */
     size_t              num_bf;             /* No. of dofs per elem */
diff --git a/kernels_cpu.cpp b/kernels_cpu.cpp
index 1fe17ea9a4bc1c89c5124da5e8e80e88f0b5f78f..e8bec0f861f27916c9e682b968a0e8b648e45cfe 100644
--- a/kernels_cpu.cpp
+++ b/kernels_cpu.cpp
@@ -1,47 +1,34 @@
 #include "types.h"
 #include "entity_data.h"
+#include "kernels_cpu.h"
 
 /* Field derivatives kernel, case for elements with planar faces */
 void compute_field_derivatives(const entity_data_cpu<1>& ed,
     const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
 {
-    /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
-    size_t orient_elem_base = 0;
-    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    switch (ed.a_order)
     {
-        size_t num_elems = ed.num_elems[iO];
-        size_t orient_base = orient_elem_base * ed.num_bf;
+        case 1:
+            compute_field_derivatives_kernel<1>(ed, f, df_dx, df_dy, df_dz);
+            break;
 
-        for (size_t iT = 0; iT < num_elems; iT++)
-        {
-            size_t ent_iT = orient_elem_base + iT;
-            size_t dofofs = ed.dofs_base + orient_base + iT*ed.num_bf;
+        case 2:
+            compute_field_derivatives_kernel<2>(ed, f, df_dx, df_dy, df_dz);
+            break;
 
-            for (size_t odof = 0; odof < ed.num_bf; odof++)
-            {
-                double acc_df_dx = 0.0;
-                double acc_df_dy = 0.0;
-                double acc_df_dz = 0.0;
+        case 3:
+            compute_field_derivatives_kernel<3>(ed, f, df_dx, df_dy, df_dz);
+            break;
 
-                for (size_t refsys_dir = 0; refsys_dir < 3; refsys_dir++)
-                {
-                    for (size_t idof = 0; idof < ed.num_bf; idof++)
-                    {
-                        auto dm_row = iO*ed.num_bf + odof;
-                        auto dm_col = refsys_dir*ed.num_bf + idof;
-                        double D = ed.differentiation_matrix(dm_row, dm_col);
-                        acc_df_dx += ed.jacobians(3*ent_iT + 0, refsys_dir) * D * f(dofofs+idof);
-                        acc_df_dy += ed.jacobians(3*ent_iT + 1, refsys_dir) * D * f(dofofs+idof);
-                        acc_df_dz += ed.jacobians(3*ent_iT + 2, refsys_dir) * D * f(dofofs+idof);
-                    }
-                }
+        case 4:
+            compute_field_derivatives_kernel<4>(ed, f, df_dx, df_dy, df_dz);
+            break;
+        
+        case 5:
+            compute_field_derivatives_kernel<5>(ed, f, df_dx, df_dy, df_dz);
+            break;
 
-                df_dx(dofofs+odof) = acc_df_dx;
-                df_dy(dofofs+odof) = acc_df_dy;
-                df_dz(dofofs+odof) = acc_df_dz;
-            }
-        }
-
-        orient_elem_base += num_elems;
+        default:
+            std::cout << "compute_field_derivatives: invalid order" << std::endl;
     }
 }
\ No newline at end of file
diff --git a/kernels_cpu.h b/kernels_cpu.h
index 08b5abd085b5ec60a12e318892543f1581902d9e..339bbe9b3c41d73d658a02c00312f3741d84a12d 100644
--- a/kernels_cpu.h
+++ b/kernels_cpu.h
@@ -1,5 +1,76 @@
 #pragma once
 
-void compute_field_derivatives(const entity_data_cpu<1>& ed,
-    const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz);
+template<size_t AK>
+struct num_dofs_3D
+{
+    static const size_t value = ((AK+3)*(AK+2)*(AK+1))/6;
+};
+
+template<size_t AK>
+struct num_dofs_2D
+{
+    static const size_t value = ((AK+2)*(AK+1))/2;
+};
+
+template<size_t AK>
+struct kernel_cpu_sizes
+{
+    static const size_t num_bf = num_dofs_3D<AK>::value;
+    static const size_t num_fluxes = num_dofs_2D<AK>::value;
+    static const size_t num_faces = 4;
+    static const size_t num_all_fluxes = num_fluxes * num_faces;
+};
+
+
+template<size_t AK>
+void compute_field_derivatives_kernel(const entity_data_cpu<1>& ed,
+    const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
+{
+    /* Flop count: 3*ed.num_bf*3*ed.num_bf*num_total_elems */
+    /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
+
+    using KS = kernel_cpu_sizes<AK>;
+    assert(ed.num_bf == KS::num_bf);
+
+    size_t orient_elem_base = 0;
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    {
+        size_t num_elems = ed.num_elems[iO];
+        size_t orient_base = orient_elem_base * ed.num_bf;
 
+        for (size_t iT = 0; iT < num_elems; iT++)
+        {
+            size_t ent_iT = orient_elem_base + iT;
+            size_t dofofs = ed.dofs_base + orient_base + iT*ed.num_bf;
+
+            for (size_t odof = 0; odof < KS::num_bf; odof++)
+            {
+                double acc_df_dx = 0.0;
+                double acc_df_dy = 0.0;
+                double acc_df_dz = 0.0;
+
+                for (size_t refsys_dir = 0; refsys_dir < 3; refsys_dir++)
+                {
+                    for (size_t idof = 0; idof < KS::num_bf; idof++)
+                    {
+                        auto dm_row = iO*ed.num_bf + odof;
+                        auto dm_col = refsys_dir*ed.num_bf + idof;
+                        double D = ed.differentiation_matrix(dm_row, dm_col);
+                        acc_df_dx += ed.jacobians(3*ent_iT + 0, refsys_dir) * D * f(dofofs+idof);
+                        acc_df_dy += ed.jacobians(3*ent_iT + 1, refsys_dir) * D * f(dofofs+idof);
+                        acc_df_dz += ed.jacobians(3*ent_iT + 2, refsys_dir) * D * f(dofofs+idof);
+                    }
+                }
+
+                df_dx(dofofs+odof) = acc_df_dx;
+                df_dy(dofofs+odof) = acc_df_dy;
+                df_dz(dofofs+odof) = acc_df_dz;
+            }
+        }
+
+        orient_elem_base += num_elems;
+    }
+}
+
+void compute_field_derivatives(const entity_data_cpu<1>& ed,
+    const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz);
\ No newline at end of file
diff --git a/test_differentiation.cpp b/test_differentiation.cpp
index 74331e3d00781562322a066358311bc8695838b1..987345998d283c7f62c2e45bf84d80a5c706afb4 100644
--- a/test_differentiation.cpp
+++ b/test_differentiation.cpp
@@ -8,6 +8,7 @@
 #include "sgr.hpp"
 #include "entity_data.h"
 #include "kernels_cpu.h"
+#include "timecounter.h"
 
 using namespace sgr;
 
@@ -65,7 +66,13 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         entity_data_cpu<1> ed;
         e0.populate_entity_data(ed);
 
+        timecounter tc;
+        tc.tic();
         compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0);
+        double time = tc.toc();
+        std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+        size_t flops = 3*ed.num_bf*3*ed.num_bf*e0.num_elements();
+        std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
 
         vecxd Pdf_dx_e0 = e0.project(df_dx);
         vecxd Pdf_dy_e0 = e0.project(df_dy);
diff --git a/timecounter.h b/timecounter.h
new file mode 100644
index 0000000000000000000000000000000000000000..1654558be55ba78ac79810ee5424d68d03d898ff
--- /dev/null
+++ b/timecounter.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <chrono>
+
+#ifdef HAVE_CUDA_RT
+#include <cuda_runtime.h>
+#endif
+
+class timecounter
+{
+    std::chrono::time_point<std::chrono::steady_clock>    begin, end;
+
+public:
+    timecounter()
+    {}
+
+    void tic()
+    {
+        begin = std::chrono::steady_clock::now();
+    }
+
+    double toc()
+    {
+        end = std::chrono::steady_clock::now();
+        std::chrono::duration<double> diff = end-begin;
+        return diff.count();
+    }
+};
+
+#ifdef HAVE_CUDA_RT
+class timecounter_cuda
+{
+    cudaEvent_t start;
+    cudaEvent_t stop;
+
+public:
+    timecounter_cuda()
+    {
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+    }
+
+    ~timecounter_cuda()
+    {
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+    }
+
+    void tic()
+    {
+        cudaEventRecord(start, 0);
+    }
+
+    double toc()
+    {
+        cudaEventRecord(stop, 0);
+
+        float elapsed;
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&elapsed, start, stop);
+        return elapsed/1000.0;
+    }
+};
+#endif