Added Kokkos test snippets.

551bb093 · Matteo Cicuttin · 551bb093 · 551bb093 · 551bb093 · 551bb093
Commit 551bb093 authored 5 years ago by Matteo Cicuttin
--- a/kokkos-testing/CMakeLists.txt
+++ b/kokkos-testing/CMakeLists.txt
+project(kokkos-testing)
+cmake_minimum_required(VERSION 3.10)
+
+add_subdirectory(kokkos)
+
+add_executable(finite_difference_kokkos finite_difference_kokkos.cpp)
+target_link_libraries(finite_difference_kokkos Kokkos::kokkos siloh5)
+
+add_executable(finite_difference_baseline finite_difference_baseline.cpp)
+target_link_libraries(finite_difference_baseline siloh5)
+ 
+
--- a/kokkos-testing/README.md
+++ b/kokkos-testing/README.md
+Testing code for Kokkos/OpenACC/CUDA
+
+You have to
+
+    git clone https://github.com/kokkos/kokkos.git
+
+before compiling anything
+
--- a/kokkos-testing/finite_difference_baseline.cpp
+++ b/kokkos-testing/finite_difference_baseline.cpp
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <sys/time.h>
+#include <vector>
+#include <cmath>
+#include <chrono>
+
+#include <silo.h>
+
+template<typename T>
+class fd_grid
+{
+    std::vector<T>  m_data;
+    size_t m_rows, m_cols;
+
+public:
+    fd_grid()
+    {}
+
+    fd_grid(size_t rows, size_t cols) :
+        m_data(rows*cols), m_rows(rows), m_cols(cols)
+    {}
+
+    fd_grid(fd_grid&& other)
+    {
+        std::swap(m_data, other.m_data);
+    }
+
+    fd_grid& operator=(fd_grid&& other)
+    {
+        std::swap(m_data, other.m_data);
+        return *this;
+    }
+
+    T operator()(size_t i, size_t j) const
+    {
+        return m_data[i*m_cols + j];
+    }
+
+    T& operator()(size_t i, size_t j)
+    {
+        return m_data[i*m_cols + j];
+    }
+
+    size_t rows() const { return m_rows; }
+    size_t cols() const { return m_cols; }
+
+    T* data() { return m_data.data(); }
+    const T* data() const { return m_data.data(); }
+};
+
+int
+visit_dump(const fd_grid<double>& g, const std::string& fn)
+{
+    DBfile *db = nullptr;
+    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
+    if (!db)
+    {
+        std::cout << "Cannot write simulation output" << std::endl;
+        return -1;
+    }
+
+    auto size_x = g.cols();
+    auto size_y = g.rows();
+    
+    std::vector<double> x(size_x);
+    std::vector<double> y(size_y);
+    
+    for (size_t i = 0; i < size_x; i++)
+        x.at(i) = double(i)/(size_x-1);
+        
+    for (size_t i = 0; i < size_y; i++)
+        y.at(i) = double(i)/(size_y-1);
+        
+    int dims[] = { int(size_y), int(size_y) };
+    int ndims = 2;
+    double *coords[] = {x.data(), y.data()};
+    
+    DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
+        
+    DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
+    
+    DBClose(db);
+    return 0;
+}
+
+//#define HIGH_ORDER
+
+#ifdef HIGH_ORDER
+    #define HALO_SIZE 4
+#else
+    #define HALO_SIZE 1
+#endif
+
+int main( int argc, char* argv[] )
+{
+    using namespace std::chrono;
+
+    int     Nx          = 1024;
+    int     Ny          = 1024;
+    int     halo        = HALO_SIZE;
+
+    int     rows        = Ny + 2*halo;
+    int     cols        = Nx + 2*halo;
+    int     timesteps   = 500;
+    double  dx          = 1./(Nx-1);
+    double  dy          = 1./(Ny-1);
+    double  dt          = 0.0001;
+
+    double  velocity    = 1;
+    double  damping     = 0.1;
+
+    auto c2 = velocity*velocity;
+    auto dx2 = dx*dx;
+    auto dy2 = dy*dy;
+    auto dt2 = dt*dt;
+
+    auto D1 = 1./(1. + 0.5*damping*dt);
+    auto D2 = 0.5*damping*dt - 1;
+
+    typedef fd_grid<double>  matrix_view_type;
+
+    matrix_view_type u_prev(rows, cols);
+    matrix_view_type u_curr(rows, cols);
+    matrix_view_type u_next(rows, cols);
+
+    for (int i = 0; i < rows; i++)
+    {
+        for (int j = 0; j < cols; j++)
+        {
+            if ( i < halo or i > rows-halo or j < halo or j > cols-halo)
+            {
+                u_prev(i,j) = 0.0;
+                u_curr(i,j) = 0.0;
+            }
+            else
+            {
+                double x = dx*i - 0.5;
+                double y = dy*j - 0.1;
+                u_prev(i,j) = -std::exp(-2400*(x*x + y*y));
+                u_curr(i,j) = 2*dt*u_prev(i,j);
+            }
+        }
+    }
+
+    double iter_time = 0.0;
+
+    double w0 = -205.0/72.0;
+    double w1 = 8.0/5.0;
+    double w2 = -1.0/5.0;
+    double w3 = 8.0/315.0;
+    double w4 = -1.0/560.0;
+    double w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+
+    for (int ts = 0; ts < timesteps; ts++)
+    {
+        auto t_begin = high_resolution_clock::now();
+
+        for (size_t i = halo; i < rows - halo; i++)
+        {
+            for (size_t j = halo; j < cols - halo; j++)
+            {
+                /* 2nd order in space */
+#ifndef HIGH_ORDER
+                u_next(i,j) = D1*(2.0*u_curr(i,j) + D2*u_prev(i,j) +
+                        + (c2*dt2/dy2)*(u_curr(i+1,j) - 2.0*u_curr(i,j) + u_curr(i-1,j))
+                        + (c2*dt2/dx2)*(u_curr(i,j+1) - 2.0*u_curr(i,j) + u_curr(i,j-1)));
+
+                if (j == 1)
+                    u_next(i,j-1) = 0;//u_next(i,j);
+
+                if (j == cols-2)
+                    u_next(i,j+1) = 0;//u_next(i,j);
+
+                if (i == 1)
+                    u_next(i-1,j) = 0;//u_next(i,j);
+
+                if (i == rows-2)
+                    u_next(i+1,j) = 0;//u_next(i,j);
+#else
+
+                /* 8th order in space */
+                double deriv_x = 0.0;
+                double deriv_y = 0.0;
+                for (int k = -4; k <= 4; k++)
+                {
+                    deriv_x += w[k+4]*u_curr(i+k,j);
+                    deriv_y += w[k+4]*u_curr(i,j+k);
+                }
+                u_next(i,j) = D1*(2.0*u_curr(i,j) + D2*u_prev(i,j) +
+                        + (c2*dt2/dy2)*(deriv_y)
+                        + (c2*dt2/dx2)*(deriv_x));
+
+                if (j == halo)
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(i,k) = 0;
+
+                if (j == cols-(halo+1))
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(i,j+k+1) = 0;
+
+                if (i == 1)
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(k,j) = 0;
+
+                if (i == rows-(halo+1))
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(i+k+1,j) = 0;
+#endif
+            }
+        }
+
+        std::swap(u_prev, u_curr);
+        std::swap(u_curr, u_next);
+
+        auto t_end = high_resolution_clock::now();
+
+        duration<double> time_span = duration_cast<duration<double>>(t_end - t_begin);
+        iter_time += time_span.count();
+
+        /*
+        if ( (ts % 100) == 0 )
+        {
+            std::stringstream ss;
+            ss << "solution_" << ts << ".silo";
+            visit_dump(u_curr, ss.str());
+        }
+        */
+    }
+
+    std::cout << "Average iteration time: " << iter_time/timesteps << std::endl;
+
+    return 0;
+}
+
--- a/kokkos-testing/finite_difference_kokkos.cpp
+++ b/kokkos-testing/finite_difference_kokkos.cpp
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <cmath>
+#include <chrono>
+#include <type_traits>
+
+#include <silo.h>
+#include <Kokkos_Core.hpp>
+
+template<typename T>
+int
+visit_dump(const Kokkos::View<T**>& kv, const std::string& fn)
+{
+    static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value,
+                  "Only double or float");
+
+    DBfile *db = nullptr;
+    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
+    if (!db)
+    {
+        std::cout << "Cannot write simulation output" << std::endl;
+        return -1;
+    }
+
+    auto size_x = kv.extent(0);
+    auto size_y = kv.extent(1);
+    
+    std::vector<T> x(size_x);
+    std::vector<T> y(size_y);
+    
+    for (size_t i = 0; i < size_x; i++)
+        x.at(i) = T(i)/(size_x-1);
+        
+    for (size_t i = 0; i < size_y; i++)
+        y.at(i) = T(i)/(size_y-1);
+        
+    int dims[] = { int(size_y), int(size_y) };
+    int ndims = 2;
+    T *coords[] = { x.data(), y.data() };
+    
+    if (std::is_same<T,float>::value)
+        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims,
+                      DB_FLOAT, DB_COLLINEAR, NULL);
+
+    if (std::is_same<T,double>::value)
+        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims,
+                      DB_DOUBLE, DB_COLLINEAR, NULL);
+    
+    std::vector<T> data(x.size() * y.size());
+    
+    for (size_t i = 0; i < x.size(); i++)
+        for (size_t j = 0; j < y.size(); j++)
+            data.at(i*y.size()+j) = kv(i,j);
+    
+    if (std::is_same<T,float>::value)
+        DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims,
+                      NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
+    
+    if (std::is_same<T,double>::value)
+        DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims,
+                      NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
+
+    DBClose(db);
+    return 0;
+}
+
+#define HIGH_ORDER
+
+#ifdef HIGH_ORDER
+    #define HALO_SIZE 4
+#else
+    #define HALO_SIZE 1
+#endif
+
+int main(int argc, char *argv[])
+{
+    using T = double;
+    using namespace Kokkos;
+    using namespace std::chrono;
+
+    int     Nx          = 1024;
+    int     Ny          = 1024;
+    int     halo        = HALO_SIZE;
+
+    int     rows        = Ny + 2*halo;
+    int     cols        = Nx + 2*halo;
+    int     timesteps   = 5000;
+    T       dx          = 1./(Nx-1);
+    T       dy          = 1./(Ny-1);
+    T       dt          = 0.0001;
+
+    T       velocity    = 1;
+    T       damping     = 0.1;
+
+    auto c2     = velocity*velocity;
+    auto dx2    = dx*dx;
+    auto dy2    = dy*dy;
+    auto dt2    = dt*dt;
+
+    auto D1 = 1./(1. + 0.5*damping*dt);
+    auto D2 = 0.5*damping*dt - 1;
+
+    Kokkos::initialize( argc, argv );
+
+    {
+        typedef View<T**> matrix_view_type;
+
+        matrix_view_type u_prev("u_next", rows, cols);
+        matrix_view_type u_curr("u_curr", rows, cols);
+        matrix_view_type u_next("u_next", rows, cols);
+
+        for (int i = 0; i < rows; i++)
+        {
+            for (int j = 0; j < cols; j++)
+            {
+                if ( i < halo or i > rows-halo or j < halo or j > cols-halo)
+                {
+                    u_prev(i,j) = 0.0;
+                    u_curr(i,j) = 0.0;
+                }
+                else
+                {
+                    T x = dx*i - 0.5;
+                    T y = dy*j - 0.1;
+                    u_prev(i,j) = -std::exp(-2400*(x*x + y*y));
+                    u_curr(i,j) = 2*dt*u_prev(i,j);
+                }
+            }
+        }
+
+        MDRangePolicy<Rank<2>> range({halo,halo}, {rows-(halo+1), cols-(halo+1)});
+
+#ifdef HIGH_ORDER
+        static const T w0 = -205.0/72.0;
+        static const T w1 =    8.0/5.0;
+        static const T w2 =   -1.0/5.0;
+        static const T w3 =    8.0/315.0;
+        static const T w4 =   -1.0/560.0;
+        static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+#endif
+
+        double iter_time = 0.0;
+        for (int ts = 0; ts < timesteps; ts++)
+        {
+            auto t_begin = high_resolution_clock::now();
+
+            parallel_for(range, KOKKOS_LAMBDA(int i, int j)
+            {
+#ifndef HIGH_ORDER
+                u_next(i,j) = D1 * (
+                            2.0*u_curr(i,j)
+                            + D2*u_prev(i,j) +
+                            + (c2*dt2/dy2) * 
+                                (u_curr(i+1,j) - 2.0*u_curr(i,j) + u_curr(i-1,j))
+                            + (c2*dt2/dx2) *
+                                (u_curr(i,j+1) - 2.0*u_curr(i,j) + u_curr(i,j-1))
+                            );
+
+                if (j == 1)
+                    u_next(i,j-1) = 0
+
+                if (j == cols-2)
+                    u_next(i,j+1) = 0;
+
+                if (i == 1)
+                    u_next(i-1,j) = 0;
+
+                if (i == rows-2)
+                    u_next(i+1,j) = 0;
+#else
+                /* 8th order in space */
+                T deriv_x = 0.0;
+                T deriv_y = 0.0;
+                for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
+                {
+                    deriv_x += w[k+HALO_SIZE]*u_curr(i+k,j);
+                    deriv_y += w[k+HALO_SIZE]*u_curr(i,j+k);
+                }
+                u_next(i,j) = D1*(2.0*u_curr(i,j) + D2*u_prev(i,j) +
+                        + (c2*dt2/dy2)*(deriv_y)
+                        + (c2*dt2/dx2)*(deriv_x));
+
+                if (j == halo)
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(i,k) = 0;
+
+                if (j == cols-(halo+1))
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(i,j+k+1) = 0;
+
+                if (i == 1)
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(k,j) = 0;
+
+                if (i == rows-(halo+1))
+                    for (size_t k = 0; k < halo; k++)
+                        u_next(i+k+1,j) = 0;
+#endif
+            });
+            
+            std::swap(u_prev, u_curr);
+            std::swap(u_curr, u_next);
+
+            auto t_end = high_resolution_clock::now();
+
+            duration<double> time_span =
+                duration_cast<duration<double>>(t_end - t_begin);
+            iter_time += time_span.count();
+
+            
+            if ( (ts % 100) == 0 )
+            {
+                std::stringstream ss;
+                ss << "solution_" << ts << ".silo";
+                visit_dump(u_curr, ss.str());
+            }
+            
+        }
+
+        double avg_iter_time = iter_time/timesteps;
+        std::cout << "Average iteration time: " << avg_iter_time << std::endl;
+    }
+
+    Kokkos::finalize();
+
+    return 0;
+}
+
--- a/kokkos-testing/finite_differences.cu
+++ b/kokkos-testing/finite_differences.cu
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstdio>
+#include <cassert>
+#include <silo.h>
+
+#define DEBUG
+
+#define SINGLE_PRECISION
+
+template<typename T>
+class fd_grid
+{
+    std::vector<T>  m_data;
+    size_t m_rows, m_cols;
+
+public:
+    fd_grid()
+    {}
+
+    fd_grid(size_t rows, size_t cols) :
+        m_data(rows*cols), m_rows(rows), m_cols(cols)
+    {}
+
+    fd_grid(fd_grid&& other)
+    {
+        std::swap(m_data, other.m_data);
+    }
+
+    fd_grid& operator=(fd_grid&& other)
+    {
+        std::swap(m_data, other.m_data);
+        return *this;
+    }
+
+    T operator()(size_t i, size_t j) const
+    {
+        return m_data[i*m_cols + j];
+    }
+
+    T& operator()(size_t i, size_t j)
+    {
+        return m_data[i*m_cols + j];
+    }
+
+    size_t rows() const { return m_rows; }
+    size_t cols() const { return m_cols; }
+
+    T* data() { return m_data.data(); }
+    const T* data() const { return m_data.data(); }
+};
+
+template<typename T>
+int
+visit_dump(const fd_grid<T>& g, const std::string& fn)
+{
+    static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
+
+    DBfile *db = nullptr;
+    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
+    if (!db)
+    {
+        std::cout << "Cannot write simulation output" << std::endl;
+        return -1;
+    }
+
+    auto size_x = g.cols();
+    auto size_y = g.rows();
+    
+    std::vector<T> x(size_x);
+    std::vector<T> y(size_y);
+    
+    for (size_t i = 0; i < size_x; i++)
+        x.at(i) = T(i)/(size_x-1);
+        
+    for (size_t i = 0; i < size_y; i++)
+        y.at(i) = T(i)/(size_y-1);
+        
+    int dims[] = { int(size_y), int(size_y) };
+    int ndims = 2;
+    
+    T *coords[] = {x.data(), y.data()};
+    
+    if (std::is_same<T,float>::value)
+    {
+        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);    
+        DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
+    }
+    else
+    {
+        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);    
+        DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
+    }
+
+    DBClose(db);
+    return 0;
+}
+
+
+
+inline cudaError_t checkCuda(cudaError_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+  if (result != cudaSuccess) {
+    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
+    assert(result == cudaSuccess);
+  }
+#endif
+  return result;
+}
+
+#ifdef SINGLE_PRECISION
+__constant__ float d_w[9];
+#else
+__constant__ double d_w[9];
+#endif
+
+template<typename T>
+__global__ void test_kernel(T* data)
+{
+    int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
+    int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
+
+    data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
+}
+
+
+#define     GRID_ROWS       32
+#define     GRID_COLS       32
+#define     HALO_SIZE       4
+
+int grid_rows;
+int grid_cols;
+
+template<typename T>
+__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
+{
+    __shared__ T s_curr[GRID_COLS+2*HALO_SIZE];
+
+    /* Get X/Y coordinates of this grid point */
+    int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
+    int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    /* Get offset of this grid point in the fd grid array */
+    int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
+
+    int shm_ofs = threadIdx.x + HALO_SIZE;
+
+    s_curr[shm_ofs] = u_curr[ofs];
+
+    __syncthreads();
+    
+    
+    if (threadIdx.x < 4)
+    {
+        s_curr[threadIdx.x] = 0.0;
+        s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
+    }
+
+    __syncthreads();
+    
+    T dx = 0.0;
+    for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
+        dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
+
+    u_next[ofs] = dx;
+}
+
+#if 0
+template<typename T>
+__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
+{
+    __shared__ T s_curr[2*HALO_SIZE][32];
+
+    /* Get X/Y coordinates of this grid point */
+    int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
+    int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    /* Get offset of this grid point in the fd grid array */
+    int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
+
+    int shm_ofs_x = threadIdx.x;
+    int shm_ofs_y = threadIdx.y;
+
+    s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];
+
+    __syncthreads();
+    
+    
+    if (threadIdx.y < 4)
+    {
+        s_curr[threadIdx.x] = 0.0;
+        s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
+    }
+
+    __syncthreads();
+    
+    T dx = 0.0;
+    for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
+        dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
+
+    u_next[ofs] = dx;
+}
+#endif
+int main(void)
+{
+
+#ifdef SINGLE_PRECISION
+    using T = float;
+#else
+    using T = double;
+#endif
+
+    /**** Get device properties ****/
+    cudaDeviceProp prop;
+    checkCuda( cudaGetDeviceProperties(&prop, 0) );
+    printf("Device Name: %s\n", prop.name);
+    printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
+
+    /**** Initialize constants ****/
+    T w0 = -205.0/72.0;
+    T w1 = 8.0/5.0;
+    T w2 = -1.0/5.0;
+    T w3 = 8.0/315.0;
+    T w4 = -1.0/560.0;
+    T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+    checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
+
+    /**** Prepare for timing ****/
+    float milliseconds;
+    cudaEvent_t startEvent, stopEvent;
+    checkCuda( cudaEventCreate(&startEvent) );
+    checkCuda( cudaEventCreate(&stopEvent) );
+
+    /**** Prepare grid and copy to device ****/
+    fd_grid<T> g(GRID_ROWS, GRID_COLS);
+    for (size_t i = 0; i < GRID_ROWS; i++)
+        for (size_t j = 0; j < GRID_COLS; j++)
+            g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
+            //g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);
+
+    T* u_curr;
+    T* u_next;
+
+    checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
+    checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
+    checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );
+
+    dim3 gs(1,GRID_ROWS);
+    dim3 tpb(GRID_COLS,1);
+
+    checkCuda( cudaEventRecord(startEvent, 0) );
+    //for (size_t i = 0; i < 1; i++)
+        x_kernel<<<gs, tpb>>>(u_curr, u_next);
+    checkCuda( cudaEventRecord(stopEvent, 0) );
+    checkCuda( cudaEventSynchronize(stopEvent) );
+    checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
+
+    std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;
+
+    checkCuda( cudaDeviceSynchronize() );
+
+    checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );
+    
+    checkCuda( cudaFree(u_next) );
+    checkCuda( cudaFree(u_curr) );
+
+    visit_dump(g, "cuda.silo");
+
+    return 0;
+}
+
+
+
+